6 changes: 6 additions & 0 deletions clipeval/eval_all.py
@@ -16,6 +16,12 @@
("slip", "clipeval.slip.eval_slip"),
("xm3600", "clipeval.xm3600.eval_xm3600"),
("cvqa", "clipeval.cvqa.eval_cvqa"),
("zero_shot_classification_dollar_street", "clipeval.zero_shot_classification.eval_dollar_street"),
("zero_shot_classification_GeoDE", "clipeval.zero_shot_classification.eval_GeoDE"),
("zero_shot_classification_GLDv2", "clipeval.zero_shot_classification.eval_GLDv2"),
("few_shot_geo_localization_dollar_street", "clipeval.few_shot_geo_localization.eval_dollar_street"),
("few_shot_geo_localization_GeoDE", "clipeval.few_shot_geo_localization.eval_GeoDE"),
("few_shot_geo_localization_xm3600", "clipeval.few_shot_geo_localization.eval_xm3600"),
]
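The dispatcher in eval_all.py is not part of this hunk; a minimal sketch of how such a (name, module path) registry is typically consumed, assuming each listed module exposes the same main(model, preprocess_val, tokenizer, result_json) entry point as the new files below (run_all and results_dir are hypothetical names):

import importlib

def run_all(model, preprocess_val, tokenizer, results_dir, evals):
    # evals is the (name, module_path) list above; each module writes its own result JSON
    for name, module_path in evals:
        module = importlib.import_module(module_path)
        module.main(model, preprocess_val, tokenizer, f"{results_dir}/{name}.json")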


105 changes: 105 additions & 0 deletions clipeval/few_shot_geo_localization/eval_GeoDE.py
@@ -0,0 +1,105 @@
import torch
import json
from PIL import Image
from tqdm import tqdm
import pandas as pd
import numpy as np

import sys
if "external/big_vision" not in sys.path:
    sys.path.append("external/big_vision")
# or directly copy the functions from https://github.com/google-research/big_vision/blob/main/big_vision/evaluators/fewshot_lsr.py

from big_vision.evaluators.fewshot_lsr import _precompute_cache, _eig_fewshot_acc_fn

data_dir = 'data/geode/'
GROUP_KEY = 'ip_country'

# Evaluation Function
def evaluate(model, preprocess_val):
    geo_df = pd.read_csv(data_dir + 'index.csv')
    geo_df = geo_df.sample(frac=1).reset_index(drop=True)  # shuffle
    train_df = geo_df.iloc[:20000]
    test_df = geo_df.iloc[20000:]
    print("done loading data", len(geo_df), len(train_df), len(test_df))

    batch_size = 16
    device = torch.cuda.current_device()

    ## train one classification probe per shot count
    classification_probes = []
    country_ids_list = []  # one list per n_shot; in theory identical, but kept separately because some GeoDE countries are very rare
    for n_shot in [5, 10, 25]:
        train_sampled = train_df.groupby(GROUP_KEY, group_keys=False).apply(lambda x: x.sample(n=min(len(x), n_shot), random_state=42))
        country_ids = sorted(list(set(train_sampled[GROUP_KEY])))

        df = train_sampled
        with torch.no_grad():
            all_features = []
            all_labels = []
            for start in tqdm(range(0, len(df), batch_size)):
                end = min(start + batch_size, len(df))
                batch_imgs = []
                for i in range(start, end):
                    data = df.iloc[i]
                    try:
                        batch_imgs.append(Image.open(data_dir + 'images/' + data['file_path']).convert("RGB"))
                        all_labels.append(country_ids.index(data[GROUP_KEY]))
                    except OSError:
                        print(f"missing image {data['file_path']}")

                if not batch_imgs:  # every image in this batch was missing
                    continue
                images = torch.stack([preprocess_val(img).to(device) for img in batch_imgs])
                image_embs = model.encode_image(images)
                image_embs /= image_embs.norm(dim=-1, keepdim=True)

                all_features.append(image_embs)

            all_features = torch.cat(all_features, dim=0)
            print(all_features.shape)

        classification_probes.append(_precompute_cache(all_features.cpu().numpy(), all_labels, len(set(all_labels))))
        country_ids_list.append(country_ids)

    ## start eval
    n = 0
    correct = [0] * len(classification_probes)

    with torch.no_grad():
        for local_start in tqdm(range(0, len(test_df), batch_size)):
            local_end = min(local_start + batch_size, len(test_df))
            batch_imgs = []
            country_labels = []

            for i in range(local_start, local_end):
                data = test_df.iloc[i]
                try:
                    batch_imgs.append(Image.open(data_dir + 'images/' + data['file_path']).convert("RGB"))
                    country_labels.append(data[GROUP_KEY])
                except OSError:
                    print(f"missing image {data['file_path']}")

            if not batch_imgs:  # every image in this batch was missing
                continue
            images = torch.stack([preprocess_val(img).to(device) for img in batch_imgs])
            image_features = model.encode_image(images)
            image_features /= image_features.norm(dim=-1, keepdim=True)

            for ind, cache in enumerate(classification_probes):
                labels = [country_ids_list[ind].index(c) if c in country_ids_list[ind] else -1 for c in country_labels]
                if labels.count(-1) > 0:
                    print(f"WARNING: {labels.count(-1)} of {len(labels)} test samples have a country not seen in the training set.")
                correct[ind] += _eig_fewshot_acc_fn(cache, image_features.cpu().numpy(), labels, 2.0 ** 10).item()

            n += len(country_labels)

    print(f"few-shot [5, 10, 25] geo-localization on GeoDE: correct={correct}, total={n}, acc={np.array(correct)/n}")
    return correct, n

def parse_results(results, result_json):
    with open(result_json) as f:
        result = json.load(f)
    print("few-shot geo-localization GeoDE:", result['acc'])
    results['few_shot_geo_loc_GeoDE'] = result['acc']

def main(model, preprocess_val, tokenizer, result_json):
    correct, n = evaluate(model, preprocess_val)
    with open(result_json, "w") as f:
        json.dump({"correct": correct, "total": n, "acc": (np.array(correct)/n).tolist()}, f)
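For orientation, _precompute_cache and _eig_fewshot_acc_fn come from big_vision's least-squares few-shot evaluator; conceptually they amount to fitting a closed-form ridge-regression probe on the frozen image features and scoring it on a test batch (the library version additionally caches an eigendecomposition so several regularisation strengths can be evaluated cheaply). A rough NumPy sketch of the idea, not the library implementation:

import numpy as np

def ridge_probe(train_feats, train_labels, num_classes, l2=2.0 ** 10):
    # train_feats: (n, d) L2-normalised features; train_labels: list of int class ids
    targets = -np.ones((len(train_labels), num_classes))
    targets[np.arange(len(train_labels)), train_labels] = 1.0  # +/-1 one-vs-rest targets
    d = train_feats.shape[1]
    # closed-form ridge solution: (X^T X + l2 * I)^-1 X^T Y
    return np.linalg.solve(train_feats.T @ train_feats + l2 * np.eye(d), train_feats.T @ targets)

def probe_correct(weights, test_feats, test_labels):
    preds = (test_feats @ weights).argmax(axis=1)
    return int((preds == np.asarray(test_labels)).sum())  # count of correct predictions in the batch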
96 changes: 96 additions & 0 deletions clipeval/few_shot_geo_localization/eval_dollar_street.py
@@ -0,0 +1,96 @@
import torch
import json
from PIL import Image
from tqdm import tqdm
import pandas as pd
import numpy as np

import sys
if "external/big_vision" not in sys.path:
    sys.path.append("external/big_vision")
# or directly copy the functions from https://github.com/google-research/big_vision/blob/main/big_vision/evaluators/fewshot_lsr.py

from big_vision.evaluators.fewshot_lsr import _precompute_cache, _eig_fewshot_acc_fn


data_dir = 'data/DollarStreet/dataset_dollarstreet/'

# Evaluation Function
def evaluate(model, preprocess_val):
    train_df = pd.read_csv(data_dir + 'images_v2_imagenet_train.csv')
    test_df = pd.read_csv(data_dir + 'images_v2_imagenet_test.csv')
    print("done loading data", len(train_df), len(test_df))

    batch_size = 16
    device = torch.cuda.current_device()

    ## train one classification probe per shot count
    classification_probes = []
    country_ids_list = []  # one list per n_shot; in theory identical, but kept separately just in case
    for n_shot in [5, 10, 25]:
        train_sampled = train_df.groupby('country.id', group_keys=False).apply(lambda x: x.sample(n=min(len(x), n_shot), random_state=42))
        country_ids = sorted(list(set(train_sampled['country.id'])))

        df = train_sampled
        with torch.no_grad():
            all_features = []
            all_labels = []
            for start in tqdm(range(0, len(df), batch_size)):
                end = min(start + batch_size, len(df))
                batch_imgs = []
                for i in range(start, end):
                    data = df.iloc[i]
                    batch_imgs.append(Image.open(data_dir + data['imageRelPath']).convert("RGB"))
                    all_labels.append(country_ids.index(data['country.id']))

                images = torch.stack([preprocess_val(img).to(device) for img in batch_imgs])
                image_embs = model.encode_image(images)
                image_embs /= image_embs.norm(dim=-1, keepdim=True)

                all_features.append(image_embs)

            all_features = torch.cat(all_features, dim=0)
            print(all_features.shape)

        classification_probes.append(_precompute_cache(all_features.cpu().numpy(), all_labels, len(set(all_labels))))
        country_ids_list.append(country_ids)

    ## start eval
    n = 0
    correct = [0] * len(classification_probes)

    with torch.no_grad():
        for local_start in tqdm(range(0, len(test_df), batch_size)):
            local_end = min(local_start + batch_size, len(test_df))
            batch_imgs = []
            country_labels = []

            for i in range(local_start, local_end):
                data = test_df.iloc[i]
                batch_imgs.append(Image.open(data_dir + data['imageRelPath']).convert("RGB"))
                country_labels.append(data['country.id'])

            images = torch.stack([preprocess_val(img).to(device) for img in batch_imgs])
            image_features = model.encode_image(images)
            image_features /= image_features.norm(dim=-1, keepdim=True)

            for ind, cache in enumerate(classification_probes):
                labels = [country_ids_list[ind].index(c) for c in country_labels]
                correct[ind] += _eig_fewshot_acc_fn(cache, image_features.cpu().numpy(), labels, 2.0 ** 10).item()

            n += len(country_labels)

    print(f"few-shot [5, 10, 25] geo-localization on DollarStreet: correct={correct}, total={n}, acc={np.array(correct)/n}")
    return correct, n

def parse_results(results, result_json):
    with open(result_json) as f:
        result = json.load(f)
    print("few-shot geo-localization dollar street:", result['acc'])
    results['few_shot_geo_loc_dollar_street'] = result['acc']

def main(model, preprocess_val, tokenizer, result_json):
    correct, n = evaluate(model, preprocess_val)
    with open(result_json, "w") as f:
        json.dump({"correct": correct, "total": n, "acc": (np.array(correct)/n).tolist()}, f)
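A tiny illustration of the per-country n-shot sampling used in all three scripts: groups with fewer than n_shot rows keep everything they have rather than being dropped (the column names below follow the DollarStreet CSV):

import pandas as pd

toy = pd.DataFrame({'country.id': ['US'] * 8 + ['PE'] * 2,
                    'imageRelPath': [f'img_{i}.jpg' for i in range(10)]})
sampled = toy.groupby('country.id', group_keys=False).apply(lambda x: x.sample(n=min(len(x), 5), random_state=42))
print(sampled['country.id'].value_counts())  # US gets 5 rows, PE keeps its 2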
107 changes: 107 additions & 0 deletions clipeval/few_shot_geo_localization/eval_xm3600.py
@@ -0,0 +1,107 @@
import torch
import json
from PIL import Image
from tqdm import tqdm
import pandas as pd
import numpy as np

import sys
if "external/big_vision" not in sys.path:
    sys.path.append("external/big_vision")
# or directly copy the functions from https://github.com/google-research/big_vision/blob/main/big_vision/evaluators/fewshot_lsr.py

from big_vision.evaluators.fewshot_lsr import _precompute_cache, _eig_fewshot_acc_fn

data_dir = 'data/XM3600/'
GROUP_KEY = 'image/locale'

# Evaluation Function
def evaluate(model, preprocess_val):
    with open(data_dir + 'captions.jsonl', 'r') as f:
        data = [{k: v for k, v in json.loads(line).items() if k in ['image/key', 'image/locale']} for line in f]
    df = pd.DataFrame(data)
    df = df.sample(frac=1).reset_index(drop=True)  # shuffle
    train_df = df.iloc[:1800]
    test_df = df.iloc[1800:]
    print("done loading data", len(df), len(train_df), len(test_df))

    batch_size = 16
    device = torch.cuda.current_device()

    ## train one classification probe per shot count
    classification_probes = []
    country_ids_list = []  # one list per n_shot; in theory identical, but kept separately because some locales are rare
    for n_shot in [5, 10, 25]:
        train_sampled = train_df.groupby(GROUP_KEY, group_keys=False).apply(lambda x: x.sample(n=min(len(x), n_shot), random_state=42))
        country_ids = sorted(list(set(train_sampled[GROUP_KEY])))

        df = train_sampled
        with torch.no_grad():
            all_features = []
            all_labels = []
            for start in tqdm(range(0, len(df), batch_size)):
                end = min(start + batch_size, len(df))
                batch_imgs = []
                for i in range(start, end):
                    data = df.iloc[i]
                    try:
                        batch_imgs.append(Image.open(data_dir + f"images/{data['image/key']}.jpg").convert("RGB"))
                        all_labels.append(country_ids.index(data[GROUP_KEY]))
                    except OSError:
                        print(f"missing image {data['image/key']}")

                if not batch_imgs:  # every image in this batch was missing
                    continue
                images = torch.stack([preprocess_val(img).to(device) for img in batch_imgs])
                image_embs = model.encode_image(images)
                image_embs /= image_embs.norm(dim=-1, keepdim=True)

                all_features.append(image_embs)

            all_features = torch.cat(all_features, dim=0)
            print(all_features.shape)

        classification_probes.append(_precompute_cache(all_features.cpu().numpy(), all_labels, len(set(all_labels))))
        country_ids_list.append(country_ids)

    ## start eval
    n = 0
    correct = [0] * len(classification_probes)

    with torch.no_grad():
        for local_start in tqdm(range(0, len(test_df), batch_size)):
            local_end = min(local_start + batch_size, len(test_df))
            batch_imgs = []
            country_labels = []

            for i in range(local_start, local_end):
                data = test_df.iloc[i]
                try:
                    batch_imgs.append(Image.open(data_dir + f"images/{data['image/key']}.jpg").convert("RGB"))
                    country_labels.append(data[GROUP_KEY])
                except OSError:
                    print(f"missing image {data['image/key']}")

            if not batch_imgs:  # every image in this batch was missing
                continue
            images = torch.stack([preprocess_val(img).to(device) for img in batch_imgs])
            image_features = model.encode_image(images)
            image_features /= image_features.norm(dim=-1, keepdim=True)

            for ind, cache in enumerate(classification_probes):
                labels = [country_ids_list[ind].index(c) if c in country_ids_list[ind] else -1 for c in country_labels]
                if labels.count(-1) > 0:
                    print(f"WARNING: {labels.count(-1)} of {len(labels)} test samples have a locale not seen in the training set.")
                correct[ind] += _eig_fewshot_acc_fn(cache, image_features.cpu().numpy(), labels, 2.0 ** 10).item()

            n += len(country_labels)

    print(f"few-shot [5, 10, 25] geo-localization on XM3600: correct={correct}, total={n}, acc={np.array(correct)/n}")
    return correct, n

def parse_results(results, result_json):
    with open(result_json) as f:
        result = json.load(f)
    print("few-shot geo-localization XM3600:", result['acc'])
    results['few_shot_geo_loc_xm3600'] = result['acc']

def main(model, preprocess_val, tokenizer, result_json):
    correct, n = evaluate(model, preprocess_val)
    with open(result_json, "w") as f:
        json.dump({"correct": correct, "total": n, "acc": (np.array(correct)/n).tolist()}, f)
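A minimal standalone driver for any of the three modules above might look like the following; open_clip is an assumption here (the scripts only require a model exposing encode_image plus the matching validation transform, and the tokenizer argument is unused by these evaluations):

import open_clip  # assumption: any CLIP-style model with encode_image() and a val transform works
from clipeval.few_shot_geo_localization import eval_xm3600

model, _, preprocess_val = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
tokenizer = open_clip.get_tokenizer('ViT-B-32')
model = model.cuda().eval()
eval_xm3600.main(model, preprocess_val, tokenizer, 'results/few_shot_geo_localization_xm3600.json')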