diff --git a/eval/humaneval.py b/eval/humaneval.py
index ca3ca008..24fc952f 100644
--- a/eval/humaneval.py
+++ b/eval/humaneval.py
@@ -1,27 +1,33 @@
 from __future__ import annotations
-import sys, os
+import sys
+import os
+import argparse
+import subprocess
+import torch
+
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from human_eval.data import write_jsonl, read_problems
 from exllamav2 import model_init
 from exllamav2 import ExLlamaV2Cache, ExLlamaV2Cache_Q4, ExLlamaV2Cache_Q6, ExLlamaV2Cache_Q8
 from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2DynamicJob, ExLlamaV2Sampler
-import argparse, contextlib, subprocess
 import util

 # Args

-parser = argparse.ArgumentParser(description = "Run HumanEval evaluation on EXL2 model")
-parser.add_argument("-o", "--output", type = str, help = "Output .jsonl filename", required = True)
-parser.add_argument("-cs", "--cache_size", type = int, default = None)
-parser.add_argument("-spt", "--samples_per_task", type = int, default = 200)
-parser.add_argument("-cq4", "--cache_q4", action = "store_true", help = "Use Q4 cache")
-parser.add_argument("-cq6", "--cache_q6", action = "store_true", help = "Use Q6 cache")
-parser.add_argument("-cq8", "--cache_q8", action = "store_true", help = "Use Q8 cache")
-parser.add_argument("--max_tokens", type = int, default = 768, help = "Max number of tokens for each completion")
-parser.add_argument("-pf", "--prompt_format", type = str, help = "Instruct format to apply. Default is raw completion (for base models) ")
-parser.add_argument("-v", "--verbose", action = "store_true", help = "Spam completions to console while generating")
-parser.add_argument("-e", "--eval", action = "store_true", help = "Run evaluation script on output file after sampling")
-parser.add_argument("-temp", "--temperature", type = float, help = "Sampling temperature (0 for greedy), default: 0.6")
+parser = argparse.ArgumentParser(description="Run HumanEval evaluation on EXL2 model")
+parser.add_argument("-o", "--output", type=str, help="Output .jsonl filename", required=True)
+parser.add_argument("-cs", "--cache_size", type=int, default=None)
+parser.add_argument("-spt", "--samples_per_task", type=int, default=200)
+parser.add_argument("-cq4", "--cache_q4", action="store_true", help="Use Q4 cache")
+parser.add_argument("-cq6", "--cache_q6", action="store_true", help="Use Q6 cache")
+parser.add_argument("-cq8", "--cache_q8", action="store_true", help="Use Q8 cache")
+parser.add_argument("--max_tokens", type=int, default=768, help="Max number of tokens for each completion")
+parser.add_argument("-pf", "--prompt_format", type=str,
+                    help="Instruct format to apply. Default is raw completion (for base models)")
+parser.add_argument("-v", "--verbose", action="store_true", help="Spam completions to console while generating")
+parser.add_argument("-e", "--eval", action="store_true", help="Run evaluation script on output file after sampling")
+parser.add_argument("-temp", "--temperature", type=float, default=0.6, help="Sampling temperature (0 for greedy)")
+parser.add_argument("-bs", "--batch_size", type=int, default=50, help="Number of problems to process in each batch")
 model_init.add_args(parser)
 args = parser.parse_args()

@@ -37,38 +43,27 @@
 # Prompt formats

 prompt_formats = {
-    "raw": (
-        "```python\n{{problem}} ",
-        " "
-    ),
+    "raw": ("```python\n{{problem}} ", " "),
     "granite": (
         "Question:\nComplete the following Python function:\n\n{{problem}}\n\nAnswer:\n"
         "Sure! Here is how you might implement the function:\n\n```python\n{{problem}}",
         " "
     ),
     "llama": (
-        "[INST] <<SYS>>\n"
-        "You are a helpful AI coding assistant.\n"
-        "<</SYS>>\n\n"
-        "Complete the following Python function:\n\n"
-        "{{problem}} [/INST] "
+        "[INST] <<SYS>>\nYou are a helpful AI coding assistant.\n<</SYS>>\n\n"
+        "Complete the following Python function:\n\n{{problem}} [/INST] "
         "Sure! Here is how you might implement the function:\n\n```python\n{{problem}}",
         " "
     ),
     "llama3": (
-        "<|start_header_id|>system<|end_header_id|>\n\n"
-        "You are a helpful AI coding assistant.<|eot_id|>"
-        "<|start_header_id|>user<|end_header_id|>\n\n"
-        "Complete the following Python function:\n\n{{problem}}<|eot_id|>"
-        "<|start_header_id|>assistant<|end_header_id|>\n\n"
-        "Sure! Here is how you might implement the function:\n\n```python\n{{problem}}",
+        "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful AI coding assistant.<|eot_id|>"
+        "<|start_header_id|>user<|end_header_id|>\n\nComplete the following Python function:\n\n{{problem}}<|eot_id|>"
+        "<|start_header_id|>assistant<|end_header_id|>\n\nSure! Here is how you might implement the function:\n\n```python\n{{problem}}",
         " "
     ),
     "gemma": (
-        "<start_of_turn>user\n"
-        "Complete the following Python function:\n\n{{problem}}<|eot_id|>"
-        "<start_of_turn>model\n"
-        "```python\n{{problem}}",
+        "<start_of_turn>user\nComplete the following Python function:\n\n{{problem}}<|eot_id|>"
+        "<start_of_turn>model\n```python\n{{problem}}",
         " "
     )
 }
@@ -88,93 +83,71 @@
 model_init.print_options(args)
 model, tokenizer = model_init.init(
     args,
-    allow_auto_split = True,
-    progress = True,
-    max_output_len = 4,
-    max_input_len = 2048
+    allow_auto_split=True,
+    progress=True,
+    max_output_len=4,
+    max_input_len=2048
 )

-if args.cache_q4: cache_type = ExLlamaV2Cache_Q4
-elif args.cache_q6: cache_type = ExLlamaV2Cache_Q6
-elif args.cache_q8: cache_type = ExLlamaV2Cache_Q8
-else: cache_type = ExLlamaV2Cache
+if args.cache_q4:
+    cache_type = ExLlamaV2Cache_Q4
+elif args.cache_q6:
+    cache_type = ExLlamaV2Cache_Q6
+elif args.cache_q8:
+    cache_type = ExLlamaV2Cache_Q8
+else:
+    cache_type = ExLlamaV2Cache
 cache = cache_type(
     model,
-    lazy = not model.loaded,
-    max_seq_len = args.cache_size or model.config.max_seq_len
+    lazy=not model.loaded,
+    max_seq_len=args.cache_size or model.config.max_seq_len
 )

 if not model.loaded:
-    model.load_autosplit(cache, progress = True)
+    model.load_autosplit(cache, progress=True)

 # Generator

 generator = ExLlamaV2DynamicGenerator(
-    model = model,
-    cache = cache,
-    tokenizer = tokenizer,
-    max_batch_size = 256,
-    max_q_size = 4
+    model=model,
+    cache=cache,
+    tokenizer=tokenizer,
+    max_batch_size=256,
+    max_q_size=4
 )

 gen_settings = ExLlamaV2Sampler.Settings(
-    token_repetition_penalty = 1.0,
-    temperature = 0.6,
-    top_k = 50,
-    top_p = 0.6
+    token_repetition_penalty=1.0,
+    temperature=args.temperature,
+    top_k=50,
+    top_p=0.6
 )

-# Get problems
-
-problems = read_problems()
-num_samples_per_task = args.samples_per_task
-
-# Create jobs
-
-with util.get_progress() as progress:
-
-    task1 = progress.add_task("[red]Sample", total = len(problems) * num_samples_per_task, name = "Creating sample jobs")
-
-    for problem_id, problem in problems.items():
+
+def process_batch(batch_problems, batch_size, progress, sample_task, generate_task):
+    samples = []
+    for problem_id, problem in batch_problems.items():
         b_problem = problem["prompt"]
         f_problem = prompt_format.replace("{{problem}}", b_problem)
         input_ids = tokenizer.encode(f_problem, encode_special_tokens=True, add_bos=True)

-        for s in range(num_samples_per_task):
-
+        for s in range(batch_size):
             job = ExLlamaV2DynamicJob(
-                input_ids = input_ids,
-                gen_settings = gen_settings,
-                max_new_tokens = args.max_tokens,
-                stop_conditions = [tokenizer.eos_token_id],
-                token_healing = True,
-                identifier = (problem_id, s),
-                min_new_tokens = 6
+                input_ids=input_ids,
+                gen_settings=gen_settings,
+                max_new_tokens=args.max_tokens,
+                stop_conditions=[tokenizer.eos_token_id],
+                token_healing=True,
+                identifier=(problem_id, s),
+                min_new_tokens=6
             )
-
             generator.enqueue(job)
-            progress.update(task1, advance = 1)
-
-# Collect samples here
-
-samples = []
-
-# Work
-
-total_jobs = generator.num_remaining_jobs()
-cm = contextlib.nullcontext() if args.verbose else util.get_progress()
-with cm as progress:
-
-    if not args.verbose:
-        task1 = progress.add_task("[red]Sample", total = total_jobs, name = "Generating samples")
+            progress.update(sample_task, advance=1)

     while generator.num_remaining_jobs():
-
         results = generator.iterate()
         for result in results:
-
-            # End sample if generator says EOS or if there is a non-indented line at the end of the output
-
             job = result["job"]
             eos = False
             completion = job.full_completion
@@ -186,32 +159,49 @@
                 eos = True
             eos = eos or result["eos"]

-            # Collect completed sample
-
             if eos:
                 identifier = result["identifier"]
-                sample = problems[identifier[0]]["prompt"] + prefix + completion.strip()
+                sample = batch_problems[identifier[0]]["prompt"] + prefix + completion.strip()
                 if not result["eos"]:
                     generator.cancel(job)

                 if args.verbose:
                     print("----------------------------------------------------------------------")
-                    print(f" ** Problem {identifier[0]}, sample {identifier[1] + 1} / {num_samples_per_task}")
+                    print(f" ** Problem {identifier[0]}, sample {identifier[1] + 1} / {batch_size}")
                     print("----------------------------------------------------------------------")
                     print(sample)
                     print()
                 else:
-                    progress.update(task1, advance = 1)
+                    progress.update(generate_task, advance=1)

-                samples.append(dict(task_id = identifier[0], completion = prefix + completion.strip()))
+                samples.append(dict(task_id=identifier[0], completion=prefix + completion.strip()))

-# Save output
+    return samples
+
+
+# Main execution
+problems = read_problems()
+all_samples = []
+batch_size = args.batch_size
+total_samples = len(problems) * args.samples_per_task
+
+with util.get_progress() as progress:
+    sample_task = progress.add_task("[red]Sample", total=total_samples, name="Creating sample jobs")
+    generate_task = progress.add_task("[green]Sample", total=total_samples, name="Generating samples")
+
+    for i in range(0, len(problems), batch_size):
+        batch_problems = dict(list(problems.items())[i:i + batch_size])
+        batch_samples = process_batch(batch_problems, args.samples_per_task, progress, sample_task, generate_task)
+        all_samples.extend(batch_samples)
+        # Optional: Clear CUDA cache to free up memory
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+# Save output
 print(f" -- Saving: {args.output}")
-write_jsonl(args.output, samples)
+write_jsonl(args.output, all_samples)

 # Optionally launch eval script
-
 if args.eval:
-    subprocess.run(["evaluate_functional_correctness", args.output])
-
+    subprocess.run(["evaluate_functional_correctness", args.output])
\ No newline at end of file
diff --git a/eval/mmlu.py b/eval/mmlu.py
index 0d3f7d8a..0b256b85 100644
--- a/eval/mmlu.py
+++ b/eval/mmlu.py
@@ -1,197 +1,187 @@
 from __future__ import annotations
-import sys, os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from exllamav2 import model_init
-from exllamav2 import ExLlamaV2Cache, ExLlamaV2Cache_Q4, ExLlamaV2Cache_Q6, ExLlamaV2Cache_Q8
-from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2DynamicJob, ExLlamaV2Sampler
-import argparse, contextlib
-import torch
+import sys, argparse, random, torch
 import util
-import random
-
-# Args
-
-parser = argparse.ArgumentParser(description = "Run MMLU evaluation on EXL2 model")
-parser.add_argument("-cs", "--cache_size", type = int, default = None)
-parser.add_argument("-cq4", "--cache_q4", action = "store_true", help = "Use Q4 cache")
-parser.add_argument("-cq6", "--cache_q6", action = "store_true", help = "Use Q6 cache")
-parser.add_argument("-cq8", "--cache_q8", action = "store_true", help = "Use Q8 cache")
-parser.add_argument("-sub", "--subjects", type = str, default = "all", help = "Comma-separated list of categories to test, or 'all'")
-parser.add_argument("-fs", "--fewshot_examples", type = int, default = 5, help = "Number of examples for fewshot examples, max 5")
-parser.add_argument("-shf", "--shuffle", action = "store_true", help = "Shuffle choices randomly")
+from exllamav2 import model_init, ExLlamaV2Cache, ExLlamaV2Cache_Q4, ExLlamaV2Cache_Q6, ExLlamaV2Cache_Q8
+from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2DynamicJob, ExLlamaV2Sampler
+from rich.progress import Progress, TextColumn, BarColumn, TaskProgressColumn
+from collections import defaultdict
+
+# Argument Parsing
+parser = argparse.ArgumentParser(description="Run MMLU evaluation on EXL2 model")
+parser.add_argument("-cs", "--cache_size", type=int, default=None)
+parser.add_argument("-cq4", "--cache_q4", action="store_true", help="Use Q4 cache")
+parser.add_argument("-cq6", "--cache_q6", action="store_true", help="Use Q6 cache")
+parser.add_argument("-cq8", "--cache_q8", action="store_true", help="Use Q8 cache")
+parser.add_argument("-sub", "--subjects", type=str, default="all", help="Comma-separated list of categories to test, or 'all'")
+parser.add_argument("-fs", "--fewshot_examples", type=int, default=5, help="Number of examples for fewshot examples, max 5")
+parser.add_argument("-shf", "--shuffle", action="store_true", help="Shuffle choices randomly")
+parser.add_argument("-bs", "--batch_size", type=int, default=128, help="Number of problems to process in each batch. Decrease to prevent potential OOM errors")
 model_init.add_args(parser)
 args = parser.parse_args()

-# Init model and cache
-
-model_init.check_args(args)
-model_init.print_options(args)
-model, tokenizer = model_init.init(
-    args,
-    allow_auto_split = True,
-    progress = True,
-    max_output_len = 1,
-    max_input_len = 2048
-)
-
-if args.cache_q4: cache_type = ExLlamaV2Cache_Q4
-elif args.cache_q6: cache_type = ExLlamaV2Cache_Q6
-elif args.cache_q8: cache_type = ExLlamaV2Cache_Q8
-else: cache_type = ExLlamaV2Cache
-cache = cache_type(
-    model,
-    lazy = not model.loaded,
-    max_seq_len = args.cache_size or model.config.max_seq_len
-)
-
-if not model.loaded:
-    model.load_autosplit(cache, progress = True)
-
-# Generator
-
-generator = ExLlamaV2DynamicGenerator(
-    model = model,
-    cache = cache,
-    tokenizer = tokenizer,
-    max_batch_size = 1024,
-    max_q_size = 1
-)
-
-c_options = "ABCD"
-
-gen_settings = ExLlamaV2Sampler.Settings(
-    token_repetition_penalty = 1.0,
-    temperature = 1.0,
-    top_k = 10,
-    top_p = 1.0,
-)
-
-token_map = [tokenizer.single_id(piece) for piece in [" " + c for c in c_options]]
-token_rmap = { token_map[i]: i for i in range(len(c_options)) }
-gen_settings.allow_tokens(tokenizer, token_map)
-
-# Get dataset
-
-dataset_dev = util.get_dataset("cais/mmlu", "all", "dev")
-dataset_all = util.get_dataset("cais/mmlu", "all", "test")
-dataset_dev = sorted(dataset_dev, key = lambda q: q["subject"])
-dataset_all = sorted(dataset_all, key = lambda q: q["subject"])
-
-all_subjects = set([q["subject"] for q in dataset_dev])
-if args.subjects != "all":
-    sel_subjects = args.subjects.split(",")
-    for s in sel_subjects:
-        if s not in all_subjects:
-            print(f"Subject: {s} is not present in dataset")
-            sys.exit()
-    all_subjects = set(sel_subjects)
-
-# Optionally shuffle
-
-if args.shuffle:
-    for problem in dataset_all:
-        if problem["subject"] in all_subjects:
-            perm = random.sample(range(4), k = 4)
-            problem["choices"] = [problem["choices"][i] for i in perm]
-            problem["answer"] = perm.index(problem["answer"])
-
-# Format
-
+# Model and Cache Initialization
+def initialize_model_and_cache(args):
+    try:
+        model_init.check_args(args)
+        model_init.print_options(args)
+        model, tokenizer = model_init.init(args, allow_auto_split=True, progress=True, max_output_len=1, max_input_len=2048)
+    except Exception as e:
+        print(f"Error initializing model: {e}")
+        sys.exit(1)
+
+    cache_type = {
+        "q4": ExLlamaV2Cache_Q4,
+        "q6": ExLlamaV2Cache_Q6,
+        "q8": ExLlamaV2Cache_Q8
+    }.get(next((k[6:] for k, v in vars(args).items() if k.startswith('cache_q') and v), None), ExLlamaV2Cache)
+
+    cache = cache_type(model, lazy=not model.loaded, max_seq_len=args.cache_size or model.config.max_seq_len)
+    if not model.loaded:
+        model.load_autosplit(cache, progress=True)
+
+    return model, tokenizer, cache
+
+# Dataset Loading and Preparation
+def load_and_prepare_datasets(args):
+    try:
+        dataset_dev = sorted(util.get_dataset("cais/mmlu", "all", "dev"), key=lambda q: q["subject"])
+        dataset_all = sorted(util.get_dataset("cais/mmlu", "all", "test"), key=lambda q: q["subject"])
+    except Exception as e:
+        print(f"Error loading datasets: {e}")
+        sys.exit(1)
+
+    all_subjects = set(q["subject"] for q in dataset_dev)
+    if args.subjects != "all":
+        sel_subjects = set(args.subjects.split(","))
+        invalid_subjects = sel_subjects - all_subjects
+        if invalid_subjects:
+            print(f"Subjects not present in dataset: {', '.join(invalid_subjects)}")
+            sys.exit(1)
+        all_subjects = sel_subjects
+
+    if args.shuffle:
+        for problem in dataset_all:
+            if problem["subject"] in all_subjects:
+                perm = random.sample(range(4), k=4)
+                problem["choices"] = [problem["choices"][i] for i in perm]
+                problem["answer"] = perm.index(problem["answer"])
+
+    return dataset_dev, dataset_all, all_subjects
+
+# Question Formatting
 def format_question(question: str, choices: list[str], answer: int | None):
-    f = question + "\n"
-    for i, c in enumerate(c_options):
-        f += c + ". " + choices[i] + "\n"
-    f += "Answer:"
-    if answer is not None:
-        f += " " + c_options[answer] + "\n\n"
+    f = question + "\n" + "\n".join(f"{c}. {choices[i]}" for i, c in enumerate("ABCD")) + "\nAnswer:"
+    if answer is not None: f += f" {'ABCD'[answer]}\n\n"
     return f

-# Fewshot preprompts
-
-preprompt_ids = {}
-with util.get_progress() as progress:
-    task1 = progress.add_task("[red]Preprompts", total = len(all_subjects), name = "Preparing preprompts")
+# Preprompt Preparation
+def prepare_preprompts(all_subjects, dataset_dev, tokenizer, args, progress, task_id):
+    preprompt_ids = {}
     for subject in all_subjects:
-
         preprompt = f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n"
         fewshots = 0
         for pq in dataset_dev:
             if fewshots == args.fewshot_examples: break
             if pq["subject"] != subject: continue
             preprompt += format_question(pq["question"], pq["choices"], pq["answer"])
-        preprompt_ids[subject] = tokenizer.encode(preprompt, add_bos = True)
-        progress.update(task1, advance = 1)
-
-# Questions
-
-total_jobs = 0
-for q in dataset_all:
-    if q["subject"] in all_subjects:
-        total_jobs += 1
-
-with util.get_progress() as progress:
-    task1 = progress.add_task("[red]Questions", total=total_jobs, name="Preparing questions")
+        preprompt_ids[subject] = tokenizer.encode(preprompt, add_bos=True)
+        progress.update(task_id, advance=1)
+    return preprompt_ids

+# Job Preparation
+def prepare_jobs(dataset_all, all_subjects, preprompt_ids, tokenizer, gen_settings, batch_size, progress, task_id):
+    jobs = []
     for q in dataset_all:
-        if q["subject"] not in all_subjects:
-            continue
-
+        if q["subject"] not in all_subjects: continue
         prompt = format_question(q["question"], q["choices"], None)
-        prompt_ids = tokenizer.encode(prompt, add_bos = False)
-
+        prompt_ids = tokenizer.encode(prompt, add_bos=False)
         job = ExLlamaV2DynamicJob(
-            input_ids = torch.cat([preprompt_ids[q["subject"]], prompt_ids], dim = -1),
-            gen_settings = gen_settings,
-            max_new_tokens = 1,
-            return_top_tokens = 4,
-            identifier = q,
+            input_ids=torch.cat([preprompt_ids[q["subject"]], prompt_ids], dim=-1),
+            gen_settings=gen_settings,
+            max_new_tokens=1,
+            return_top_tokens=4,
+            identifier=q,
         )
+        jobs.append(job)
+        progress.update(task_id, advance=1)

-        generator.enqueue(job)
-        progress.update(task1, advance = 1)
+        if len(jobs) == batch_size:
+            yield jobs
+            jobs = []

-# Work
+    if jobs:
+        yield jobs

-with util.get_progress() as progress:
-    task1 = progress.add_task("[red]Sample", total = total_jobs, name = "Testing")
+# Batch Processing
+def process_batch(generator, job_batch, token_map, token_rmap, progress, task_id):
+    for job in job_batch:
+        generator.enqueue(job)

     while generator.num_remaining_jobs():
-
         results = generator.iterate()
         for result in results:
-
-            if not result["eos"]:
-                continue
-
-            # Ignore completion and use top-K tokens only
-
-            top_tokens = result["top_k_tokens"]
-            top_probs = result["top_k_probs"]
-            q = result["identifier"]
-
+            if not result["eos"]: continue
+            top_tokens, top_probs, q = result["top_k_tokens"], result["top_k_probs"], result["identifier"]
             correct_answer = q["answer"]
             for i in range(top_tokens.shape[-1]):
                 if top_tokens[0, 0, i].item() == token_map[correct_answer]:
                     confidence = top_probs[0, 0, i].item()
-
                     q["correct_answer_confidence"] = confidence
             q["answer_correct"] = token_rmap[top_tokens[0, 0, 0].item()] == correct_answer
+            progress.update(task_id, advance=1)
+
+# Result Summarization
+def summarize_results(dataset_all):
+    results = defaultdict(lambda: {"total": 0, "correct": 0, "confidence_sum": 0})
+
+    for q in dataset_all:
+        if "answer_correct" not in q:
+            continue
+        subject = q["subject"]
+        results[subject]["total"] += 1
+        results["overall"]["total"] += 1
+        if q["answer_correct"]:
+            results[subject]["correct"] += 1
+            results["overall"]["correct"] += 1
+            results[subject]["confidence_sum"] += q["correct_answer_confidence"]
+            results["overall"]["confidence_sum"] += q["correct_answer_confidence"]
+
+    print("\nResults:")
+    print(f"{'Subject':<30} {'Accuracy':<10} {'Confidence':<10}")
+    print("-" * 50)
+    for subject, data in sorted(results.items()):
+        if subject != "overall":
+            acc = data["correct"] / data["total"] * 100
+            conf = data["confidence_sum"] / data["total"] * 100
+            print(f"{subject:<30} {acc:.2f}% {conf:.2f}%")
+
+    print("-" * 50)
+    overall = results["overall"]
+    overall_acc = overall["correct"] / overall["total"] * 100
+    overall_conf = overall["confidence_sum"] / overall["total"] * 100
+    print(f"{'Overall':<30} {overall_acc:.2f}% {overall_conf:.2f}%")
+
+# Main Execution
+model, tokenizer, cache = initialize_model_and_cache(args)
+dataset_dev, dataset_all, all_subjects = load_and_prepare_datasets(args)
+
+generator = ExLlamaV2DynamicGenerator(model=model, cache=cache, tokenizer=tokenizer, max_batch_size=1024, max_q_size=1)
+gen_settings = ExLlamaV2Sampler.Settings(token_repetition_penalty=1.0, temperature=1.0, top_k=10, top_p=1.0)
+token_map = [tokenizer.single_id(" " + c) for c in "ABCD"]
+token_rmap = {token_map[i]: i for i in range(len("ABCD"))}
+gen_settings.allow_tokens(tokenizer, token_map)

-            progress.update(task1, advance = 1)
+progress = Progress(TextColumn("[progress.description]{task.description}"), BarColumn(), TaskProgressColumn(), TextColumn("{task.completed}/{task.total}"))

-# Summarize
+with progress:
+    preprompt_task = progress.add_task("[red]Preparing preprompts", total=len(all_subjects))
+    preprompt_ids = prepare_preprompts(all_subjects, dataset_dev, tokenizer, args, progress, preprompt_task)

-total = 0
-correct = 0
-confidence_sum = 0.0
+    total_jobs = sum(1 for q in dataset_all if q["subject"] in all_subjects)
+    preparation_task = progress.add_task("[green]Preparing questions", total=total_jobs)
+    processing_task = progress.add_task("[blue]Processing questions", total=total_jobs)

-for q in dataset_all:
-    if not "answer_correct" in q:
-        continue
-    total += 1
-    if q["answer_correct"]:
-        correct += 1
-        confidence_sum += q["correct_answer_confidence"]
+    for job_batch in prepare_jobs(dataset_all, all_subjects, preprompt_ids, tokenizer, gen_settings, args.batch_size, progress, preparation_task):
+        process_batch(generator, job_batch, token_map, token_rmap, progress, processing_task)

-print(f"Correct answers: {correct}/{total} = {correct/total*100:.2f}%")
-print(f"Confidence: {confidence_sum/total*100:.2f}%")
\ No newline at end of file
+summarize_results(dataset_all)
\ No newline at end of file
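
For reference, a minimal Python sketch (not part of the patch) of the batching pattern both scripts adopt with the new -bs/--batch_size flag: the problem set is processed in fixed-size slices and the CUDA allocator cache is cleared between slices to bound peak memory. The problems dict and the run_batch callable below are hypothetical stand-ins for the real per-batch work (process_batch in humaneval.py, prepare_jobs/process_batch in mmlu.py).

import torch

def iter_batches(problems: dict, batch_size: int):
    # Yield the problem set in fixed-size slices, preserving insertion order.
    items = list(problems.items())
    for i in range(0, len(items), batch_size):
        yield dict(items[i:i + batch_size])

def run_all(problems: dict, batch_size: int, run_batch) -> list:
    # run_batch is a hypothetical callable standing in for the per-batch generation loop.
    samples = []
    for batch in iter_batches(problems, batch_size):
        samples.extend(run_batch(batch))
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # release inter-batch allocations, as the patch does
    return samples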