Refactor code for importing and using external libraries
pufanyi committed Jan 16, 2025
1 parent 4cf47ad commit 461114c
Showing 47 changed files with 354 additions and 532 deletions.
52 changes: 37 additions & 15 deletions lmms_eval/models/gpt4v.py
@@ -53,6 +53,7 @@ def __init__(
        timeout: int = 120,
        continual_mode: bool = False,
        response_persistent_folder: str = None,
+        interleaved: bool = True,
        **kwargs,
    ) -> None:
        super().__init__()
@@ -65,6 +66,7 @@ def __init__(
        self.image_token = "<image>"
        self.timeout = timeout
        self.continual_mode = continual_mode
+        self.interleaved = interleaved
        if self.continual_mode:
            if response_persistent_folder is None:
                raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.")
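
The new interleaved switch is an ordinary constructor keyword, so it can be toggled wherever the model is instantiated; in lmms-eval, fields passed via --model_args are forwarded to __init__ as keyword arguments. A minimal sketch of both modes (assumes the GPT4V class exported by this module and an API key already configured in the environment; all other arguments keep their defaults):

    from lmms_eval.models.gpt4v import GPT4V

    # New default: build one user message whose content interleaves text chunks
    # and images according to <media_N> placeholders in the prompt.
    interleaved_model = GPT4V(interleaved=True)

    # Previous behaviour: split the prompt on <image> tokens, or append the
    # images after the text when no token is present.
    legacy_model = GPT4V(interleaved=False)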
@@ -136,6 +138,21 @@ def flatten(self, input):
                new_list.append(j)
        return new_list

+    def construct_interleaved_input(self, content, media):
+        print(content, len(media))
+        pattern = r"<media_(\d+)>"
+        parts = re.split(pattern, content)
+        result = []
+        for i, part in enumerate(parts):
+            if i % 2 == 0:
+                if part == "":
+                    continue
+                result.append({"type": "text", "text": part})
+            else:
+                result.append(media[int(part)])
+
+        return result

    def generate_until(self, requests) -> List[str]:
        res = []
        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
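
The added construct_interleaved_input helper relies on re.split with a capturing group: even-indexed parts are text chunks and odd-indexed parts are the captured media indices. A standalone sketch of the same splitting logic, with an invented prompt and placeholder media entries (not taken from the repository):

    import re

    def split_interleaved(content, media):
        # The capturing group keeps each media index in the split result:
        # even positions are text, odd positions are the captured digits.
        parts = re.split(r"<media_(\d+)>", content)
        result = []
        for i, part in enumerate(parts):
            if i % 2 == 0:
                if part:  # drop empty text chunks
                    result.append({"type": "text", "text": part})
            else:
                result.append(media[int(part)])
        return result

    media = [
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}},
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,BBBB"}},
    ]
    print(split_interleaved("Compare <media_0> with <media_1>. Which is brighter?", media))
    # -> text, image 0, text, image 1, text: five content parts in prompt order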
@@ -167,23 +184,28 @@ def generate_until(self, requests) -> List[str]:

            response_json = {"role": "user", "content": []}
            # When there is no image token in the context, append the image to the text
-            if self.image_token not in contexts:
-                payload["messages"].append(deepcopy(response_json))
-                payload["messages"][0]["content"].append({"type": "text", "text": contexts})
-                for img in imgs:
-                    payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
-            else:
-                contexts = contexts.split(self.image_token)
-                for idx, img in enumerate(imgs):
+            if not self.interleaved:
+                if self.image_token not in contexts:
                    payload["messages"].append(deepcopy(response_json))
-                    payload["messages"][idx]["content"].append({"type": "text", "text": contexts[idx]})
-                    payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
-
-                # If n image tokens are in the contexts
-                # contexts will be splitted into n+1 chunks
-                # Manually add it into the payload
+                    payload["messages"][0]["content"].append({"type": "text", "text": contexts})
+                    for img in imgs:
+                        payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
+                else:
+                    contexts = contexts.split(self.image_token)
+                    for idx, img in enumerate(imgs):
+                        payload["messages"].append(deepcopy(response_json))
+                        payload["messages"][idx]["content"].append({"type": "text", "text": contexts[idx]})
+                        payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
+
+                    # If n image tokens are in the contexts
+                    # contexts will be splitted into n+1 chunks
+                    # Manually add it into the payload
+                    payload["messages"].append(deepcopy(response_json))
+                    payload["messages"][-1]["content"].append({"type": "text", "text": contexts[-1]})
+            else:
+                media = [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}} for img in imgs]
                payload["messages"].append(deepcopy(response_json))
-                payload["messages"][-1]["content"].append({"type": "text", "text": contexts[-1]})
+                payload["messages"][0]["content"].extend(self.construct_interleaved_input(contexts, media))

            if "max_new_tokens" not in gen_kwargs:
                gen_kwargs["max_new_tokens"] = 1024
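
With interleaved=True, the branch above produces a single user message whose content list mixes text and image_url parts in prompt order, in the OpenAI-style layout. A rough sketch of the payload shape being assembled (the base64 data and any generation parameters are placeholders, not values from the code):

    payload = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Compare "},
                    {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}},
                    {"type": "text", "text": " with "},
                    {"type": "image_url", "image_url": {"url": "data:image/png;base64,BBBB"}},
                    {"type": "text", "text": ". Which is brighter?"},
                ],
            }
        ]
    }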
16 changes: 6 additions & 10 deletions lmms_eval/tasks/megabench/breakdown/analysis_utils.py
@@ -1,10 +1,11 @@
import json
-from collections import defaultdict
import os
+from collections import defaultdict

# Add path definition at the top after imports
all_task_meta_path = os.path.join(os.path.dirname(__file__), "all_task_meta.json")


def task_list_refine(task_list):
    task_results = []
    for task in task_list:
@@ -47,12 +48,7 @@ def derive_keyword_stats(task_results_with_meta, include_per_task_info=False):
            if include_per_task_info:
                skills_stats[skill]["tasks"].append((task_name, score))

-        for stat_dict, key in [
-            (input_format_stats, "input_format"),
-            (output_format_stats, "output_format"),
-            (input_num_stats, "num_input"),
-            (app_stats, "app")
-        ]:
+        for stat_dict, key in [(input_format_stats, "input_format"), (output_format_stats, "output_format"), (input_num_stats, "num_input"), (app_stats, "app")]:
            if value := task.get(key):
                stat_dict[value]["count"] += 1
                stat_dict[value]["total_score"] += score
@@ -83,16 +79,16 @@ def collect_task_metadata(model_results):
    # Load the complete task metadata
    with open(all_task_meta_path, "r") as f:
        all_meta = json.load(f)

    # Create result dictionary
    all_task_meta = {}

    # Match results with metadata
    for task_result in model_results:
        task_name = task_result["name"]
        if task_name in all_meta:
            meta = all_meta[task_name].copy()  # Create a copy to avoid modifying original
            meta.update(task_result)
            all_task_meta[task_name] = meta

    return all_task_meta
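
collect_task_metadata is essentially a keyed join: each per-task result is matched to its entry in all_task_meta.json, and the .copy() guards the shared metadata against mutation when update() overlays the run's scores. A toy version of that merge, with the file loading replaced by inline dictionaries (hypothetical task name and fields):

    all_meta = {"chart_qa": {"skills": ["charts"], "eval_type": "rule"}}
    model_results = [{"name": "chart_qa", "score": 0.62, "num_query": 150}]

    all_task_meta = {}
    for task_result in model_results:
        task_name = task_result["name"]
        if task_name in all_meta:
            meta = all_meta[task_name].copy()  # leave the original metadata untouched
            meta.update(task_result)           # overlay this run's results
            all_task_meta[task_name] = meta

    print(all_task_meta["chart_qa"]["score"])  # 0.62
    print("score" in all_meta["chart_qa"])     # False: the shared dict was not modified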
83 changes: 36 additions & 47 deletions lmms_eval/tasks/megabench/breakdown/derive_breakdown_results.py
@@ -1,20 +1,18 @@
-import json
import argparse
+import json
from pathlib import Path
-from analysis_utils import (
-    task_list_refine,
-    collect_task_metadata,
-    derive_keyword_stats,
-)

+from analysis_utils import collect_task_metadata, derive_keyword_stats, task_list_refine


def calculate_model_summary(task_results_with_meta):
    """
    Re-calculate model performance summary statistics across core and open tasks.
    Args:
        task_results: List of task results with scores
        task_metadata: Dictionary containing task metadata including task types
    Returns:
        Dictionary containing summary statistics for core and open tasks
    """
@@ -23,27 +21,27 @@ def calculate_model_summary(task_results_with_meta):

    # Separate core and open tasks
    for task in task_results_with_meta.values():
-        if task['eval_type'] == 'llm':
+        if task["eval_type"] == "llm":
            open_tasks.append(task)
        else:
            core_tasks.append(task)

    def calculate_stats(tasks):
        if not tasks:
            return None
-        total_samples = sum(task.get('num_query', 0) for task in tasks)
-        macro_scores = [task.get('score', 0) for task in tasks]
+        total_samples = sum(task.get("num_query", 0) for task in tasks)
+        macro_scores = [task.get("score", 0) for task in tasks]

        return {
            "num_eval_tasks": len(tasks),
            "num_eval_samples": total_samples,
            "macro_mean_score": sum(macro_scores) / len(tasks) if tasks else 0,
        }

    core_stats = calculate_stats(core_tasks)
    open_stats = calculate_stats(open_tasks)

    # Calculate overall score (weighted average based on number of tasks)
    # If either stat is None, use only the available stat
    if core_stats is None:
@@ -53,17 +51,11 @@ def calculate_stats(tasks):
        overall_score = core_stats["macro_mean_score"] if core_stats else 0
        total_tasks = core_stats["num_eval_tasks"] if core_stats else 0
    else:
-        total_tasks = (core_stats["num_eval_tasks"] + open_stats["num_eval_tasks"])
-        overall_score = (
-            (core_stats["macro_mean_score"] * core_stats["num_eval_tasks"] +
-             open_stats["macro_mean_score"] * open_stats["num_eval_tasks"]) / total_tasks
-        )
+        total_tasks = core_stats["num_eval_tasks"] + open_stats["num_eval_tasks"]
+        overall_score = (core_stats["macro_mean_score"] * core_stats["num_eval_tasks"] + open_stats["macro_mean_score"] * open_stats["num_eval_tasks"]) / total_tasks

-    return {
-        "core": core_stats,
-        "open": open_stats,
-        "overall_score": overall_score
-    }
+    return {"core": core_stats, "open": open_stats, "overall_score": overall_score}


def merge_json_files(input_dir, output_path, key="name"):
"""
@@ -72,76 +64,73 @@ def merge_json_files(input_dir, output_path, key="name"):
    Prioritizes LLM evaluations over rule-based ones when duplicates exist.
    """
    data_dict = {}  # Using name as key for easy lookup and updates

    # Find all matching JSON files in the directory
    json_paths = list(Path(input_dir).glob("megabench*data_with_scores*.json"))
    print(f"Found {len(json_paths)} files to merge")

    # Load and merge all JSON files
    for path in json_paths:
        print(f"Processing {path}")
        with open(path, "r") as f:
            data = json.load(f)
        if isinstance(data, dict) and "data" in data:
            data = task_list_refine(data["data"])

        # Update or add entries
        for item in data:
            item_key = item[key]
            # If new item or if new item is LLM-evaluated (prioritize LLM eval)
-            if item_key not in data_dict or (
-                item.get("eval_type") == "llm" and data_dict[item_key].get("eval_type") != "llm"
-            ):
+            if item_key not in data_dict or (item.get("eval_type") == "llm" and data_dict[item_key].get("eval_type") != "llm"):
                data_dict[item_key] = item

    # Convert back to list
    merged_data = list(data_dict.values())

    # Save the merged result
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(merged_data, f, indent=4)

    print(f"Merged file with {len(merged_data)} tasks saved to {output_path}")
    return merged_data
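
The merge keeps one record per task name and lets an LLM-judged entry replace a rule-based one, but never the reverse. A compact sketch of that precedence rule with fabricated records:

    incoming = [
        {"name": "open_ended_vqa", "eval_type": "rule", "score": 0.10},
        {"name": "open_ended_vqa", "eval_type": "llm", "score": 0.45},
        {"name": "open_ended_vqa", "eval_type": "rule", "score": 0.12},  # arrives after the LLM entry
    ]

    data_dict = {}
    for item in incoming:
        item_key = item["name"]
        if item_key not in data_dict or (item.get("eval_type") == "llm" and data_dict[item_key].get("eval_type") != "llm"):
            data_dict[item_key] = item

    print(data_dict["open_ended_vqa"]["score"])  # 0.45: the LLM-evaluated entry wins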


def main():
    # Parse command line arguments
-    parser = argparse.ArgumentParser(description='Merge and process evaluation score files.')
-    parser.add_argument('--input_dir', type=str, help='Directory containing score files')
+    parser = argparse.ArgumentParser(description="Merge and process evaluation score files.")
+    parser.add_argument("--input_dir", type=str, help="Directory containing score files")
    args = parser.parse_args()

    # Convert path to Path object
    input_dir = Path(args.input_dir)

    # Create analysis directory under input directory
    output_dir = input_dir / "analysis"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Merge files
    output_path = output_dir / "task_results.json"
    task_results = merge_json_files(input_dir, output_path)

    # Collect metadata and derive keyword stats
    task_results_with_meta = collect_task_metadata(task_results)
    keyword_stats = derive_keyword_stats(task_results_with_meta)

    # Calculate model summary
    model_summary = calculate_model_summary(task_results_with_meta)

-    summary_results = {
-        "model_summary": model_summary,
-        "keyword_stats": keyword_stats
-    }
+    summary_results = {"model_summary": model_summary, "keyword_stats": keyword_stats}

    # Save keyword stats
    stats_output = output_dir / "summary_and_keyword_stats.json"
    with open(stats_output, "w") as f:
        json.dump(summary_results, f, indent=4)

    print(f"\nResults saved in {output_dir}:")
    print(f"- Merged data: {output_path}")
    print(f"- Multi-dimensional keywords stats: {stats_output}")


if __name__ == "__main__":
    main()
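
The script is driven entirely by --input_dir (for example: python derive_breakdown_results.py --input_dir /path/to/scores) and writes its outputs under <input_dir>/analysis. Based on the functions above, summary_and_keyword_stats.json should come out roughly in this shape; every number below is a placeholder and the exact keys under keyword_stats depend on the stats dictionaries built in analysis_utils.py:

    summary_and_keyword_stats = {
        "model_summary": {
            "core": {"num_eval_tasks": 440, "num_eval_samples": 6500, "macro_mean_score": 0.25},
            "open": {"num_eval_tasks": 65, "num_eval_samples": 1200, "macro_mean_score": 0.40},
            "overall_score": 0.2693,
        },
        "keyword_stats": {
            "skills": {"charts": {"count": 12, "total_score": 7.1}},
            "input_format": {"image": {"count": 300, "total_score": 80.5}},
            # ... output_format, num_input, and app buckets follow the same pattern
        },
    }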

