Adding gsm8k experiments

scallop-lang · Feb 23, 2024 · 2f01a6f · 2f01a6f
1 parent 3dbe1e9
commit 2f01a6f
Show file tree

Hide file tree

Showing 12 changed files with 886 additions and 0 deletions.
diff --git a/experiments/gsm8k/.gitignore b/experiments/gsm8k/.gitignore
@@ -0,0 +1,2 @@
+*.json
+extra*.py
diff --git a/experiments/gsm8k/dataset_loader.py b/experiments/gsm8k/dataset_loader.py
@@ -0,0 +1,18 @@
+import os
+import json
+
+class GSM8KDataset:
+  """In-memory view of a GSM8K ``.jsonl`` file.
+
+  Each non-blank line of the file is parsed as one JSON object; the objects
+  are expected to carry ``"question"`` and ``"answer"`` string fields.
+
+  Args:
+    dataset_loc: Path to the ``.jsonl`` file. Defaults to the location the
+      experiments use, which is relative to the current working directory.
+
+  Raises:
+    OSError: If the file cannot be opened.
+    json.JSONDecodeError: If a non-blank line is not valid JSON.
+  """
+
+  def __init__(self, dataset_loc="../../../gsm8k/test.jsonl"):
+    self.dataset_loc = dataset_loc
+
+    # Read as UTF-8 explicitly so the result does not depend on the locale.
+    with open(self.dataset_loc, encoding="utf-8") as dataset:
+      # Blank lines (e.g. an extra newline at the end of the file) are skipped
+      # instead of being fed to json.loads, which would raise on them.
+      self.data = [json.loads(line) for line in dataset if line.strip()]
+
+  def __len__(self):
+    """Return the number of loaded problems."""
+    return len(self.data)
+
+  def __getitem__(self, i):
+    """Return ``(question, answer)`` for problem ``i``.
+
+    The numeric answer is taken from the last whitespace-separated token of
+    the ``"answer"`` field, with thousands separators removed, as a float.
+    """
+    record = self.data[i]
+    question = record["question"]
+    # The answer is always given at the end as the last word
+    answer = float(record["answer"].split()[-1].replace(',', ''))
+
+    return (question, answer)
diff --git a/experiments/gsm8k/gsm8k_baseline.py b/experiments/gsm8k/gsm8k_baseline.py
@@ -0,0 +1,72 @@
+import openai
+import json
+from tqdm import tqdm
+from io import StringIO
+import sys
+
+from dataset_loader import GSM8KDataset
+
+# Built at import time; GSM8KDataset opens and parses its dataset file in its constructor.
+TASK = GSM8KDataset()
+# Number of problems in the loaded dataset.
+N = len(TASK)
+# Absolute tolerance used for the "margin_score" comparison in test_gsm8k.
+MARGIN = 0.001
+
+# Prompt prefix that run_gpt prepends to every question.
+# NOTE(review): this is a non-raw string, so each `\n` below becomes a real
+# newline, while `\s` is not a recognized escape: the prompt contains a literal
+# backslash followed by "s", and newer Python versions warn about the invalid
+# escape. Presumably a trailing space was intended -- confirm before changing,
+# because any edit here alters the prompt sent to the model.
+HEADER = '''
+In this task, you will be given a math word problem to solve.
+Please output your reasoning as a chain of thought and then output your answer on a new line at the end. For the final answer, do not include any non-numerical digits except for a decimal point when applicable.
+
+Here are some examples:
+
+Question: Lisa, Jack, and Tommy earned $60 from washing cars all week. However, half of the $60 was earned by Lisa. Tommy earned half of what Lisa earned. How much more money did Lisa earn than Tommy?
+Answer: Lisa earned $60 * 1/2 = $30.\nTommy earned $30 * 1/2 = $15.\nLisa earned $30 - $15 = $15 more than Tommy.
+
+15
+
+Now, answer the following question:
+
+\n
+Question:\s
+'''
+
+def run_gpt(question):
+    """Ask GPT-4 one question and return the text of its reply.
+
+    The prompt is ``HEADER`` followed by the question and an ``Answer: `` cue,
+    sent as a single user message at temperature 0.
+    """
+    prompt = HEADER + question + "\nAnswer: "
+    completion = openai.ChatCompletion.create(
+        model="gpt-4",
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0,
+    )
+    first_choice = completion["choices"][0]
+    return first_choice["message"]["content"]
+
+def test_gsm8k(range):
+    """Run the baseline prompt over the given problems and record the scores.
+
+    For each index the question is sent through ``run_gpt``; the last line of
+    the reply (commas removed) is parsed as a float and compared with the
+    reference answer, both exactly and within ``MARGIN``. A failure while
+    querying or parsing is recorded with the exception text and scores 0.
+    The accumulated results are rewritten to ``data_baseline.json`` after
+    every problem.
+
+    Args:
+        range: Iterable of indices into ``TASK``. The name shadows the builtin
+            and is kept only so existing callers keep working.
+    """
+    out = {"exact_score": 0, "margin_score": 0, "data": []}
+
+    for i in tqdm(range):
+        question, ans = TASK[i]
+
+        try:
+            output_ans = run_gpt(question)
+            # Strip first: a reply that ends with a newline would otherwise
+            # yield an empty last line and a correct answer would be lost.
+            final_ans = output_ans.strip().split('\n')[-1].replace(',', '')
+            predicted = float(final_ans)
+            expected = float(ans)
+            exact_score = int(predicted == expected)
+            margin_score = int(abs(predicted - expected) <= MARGIN)
+            out["exact_score"] += exact_score
+            out["margin_score"] += margin_score
+            out["data"].append(
+                {
+                    "id": i,
+                    "question": question,
+                    "reasoning": output_ans,
+                    "answer": final_ans,
+                    "correct_answer": ans,
+                    "exact_score": exact_score,
+                    "margin_score": margin_score,
+                }
+            )
+        except Exception as e:
+            # Deliberate best effort: an API error or an unparsable reply is
+            # logged against this problem and the run continues.
+            out["data"].append(
+                {"id": i, "question": question, "exception": str(e), "exact_score": 0, "margin_score": 0}
+            )
+
+        # Serialize before opening the file so a serialization error cannot
+        # truncate the results written so far; rewriting after every problem
+        # keeps partial progress if the run is interrupted.
+        json_object = json.dumps(out, indent=4)
+        with open("data_baseline.json", "w") as outfile:
+            outfile.write(json_object)
+
+# Runs the evaluation over all N problems whenever this module is executed or
+# imported (there is no __main__ guard).
+test_gsm8k(range(N))
diff --git a/experiments/gsm8k/gsm8k_cot.py b/experiments/gsm8k/gsm8k_cot.py
@@ -0,0 +1,72 @@
+import openai
+import json
+from tqdm import tqdm
+from io import StringIO
+import sys
+
+from dataset_loader import GSM8KDataset
+
+# Built at import time; GSM8KDataset opens and parses its dataset file in its constructor.
+TASK = GSM8KDataset()
+# Number of problems in the loaded dataset.
+N = len(TASK)
+# Absolute tolerance used for the "margin_score" comparison in test_gsm8k.
+MARGIN = 0.001
+
+# Chain-of-thought prompt prefix that run_gpt prepends to every question.
+# NOTE(review): this is a non-raw string, so each `\n` below becomes a real
+# newline, while `\s` is not a recognized escape: the prompt contains a literal
+# backslash followed by "s", and newer Python versions warn about the invalid
+# escape. Presumably a trailing space was intended -- confirm before changing,
+# because any edit here alters the prompt sent to the model.
+HEADER = '''
+In this task, you will be given a math word problem to solve.
+Please output your reasoning as a chain of thought and then output your answer on a new line at the end. For the final answer, do not include any non-numerical digits except for a decimal point when applicable.
+
+Here are some examples:
+
+Question: Lisa, Jack, and Tommy earned $60 from washing cars all week. However, half of the $60 was earned by Lisa. Tommy earned half of what Lisa earned. How much more money did Lisa earn than Tommy?
+Answer: Let's think step by step. First, we know that Lisa earned half of the $60, which is $60 / 2 = $30. Then, we know that Tommy earned half of what Lisa earned, which is $30 / 2 = $15. So, Lisa earned $30 - $15 = $15 more than Tommy.
+
+15
+
+Now, answer the following question:
+
+\n
+Question:\s
+'''
+
+def run_gpt(question):
+    """Ask GPT-4 one question with a step-by-step cue and return its reply text.
+
+    The prompt is ``HEADER`` followed by the question and the cue
+    ``Answer: Let's think step by step. ``, sent as a single user message at
+    temperature 0.
+    """
+    prompt = HEADER + question + "\nAnswer: Let's think step by step. "
+    completion = openai.ChatCompletion.create(
+        model="gpt-4",
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0,
+    )
+    first_choice = completion["choices"][0]
+    return first_choice["message"]["content"]
+
+def test_gsm8k(range):
+    """Run the chain-of-thought prompt over the given problems and record scores.
+
+    For each index the question is sent through ``run_gpt``; the last line of
+    the reply (commas removed) is parsed as a float and compared with the
+    reference answer, both exactly and within ``MARGIN``. A failure while
+    querying or parsing is recorded with the exception text and scores 0.
+    The accumulated results are rewritten to ``data_cot.json`` after every
+    problem.
+
+    Args:
+        range: Iterable of indices into ``TASK``. The name shadows the builtin
+            and is kept only so existing callers keep working.
+    """
+    out = {"exact_score": 0, "margin_score": 0, "data": []}
+
+    for i in tqdm(range):
+        question, ans = TASK[i]
+
+        try:
+            output_ans = run_gpt(question)
+            # Strip first: a reply that ends with a newline would otherwise
+            # yield an empty last line and a correct answer would be lost.
+            final_ans = output_ans.strip().split('\n')[-1].replace(',', '')
+            predicted = float(final_ans)
+            expected = float(ans)
+            exact_score = int(predicted == expected)
+            margin_score = int(abs(predicted - expected) <= MARGIN)
+            out["exact_score"] += exact_score
+            out["margin_score"] += margin_score
+            out["data"].append(
+                {
+                    "id": i,
+                    "question": question,
+                    "reasoning": output_ans,
+                    "answer": final_ans,
+                    "correct_answer": ans,
+                    "exact_score": exact_score,
+                    "margin_score": margin_score,
+                }
+            )
+        except Exception as e:
+            # Deliberate best effort: an API error or an unparsable reply is
+            # logged against this problem and the run continues.
+            out["data"].append(
+                {"id": i, "question": question, "exception": str(e), "exact_score": 0, "margin_score": 0}
+            )
+
+        # Written to its own file: the previous name, "data_baseline.json",
+        # made this script overwrite the baseline experiment's results.
+        # Serialize before opening so an error cannot truncate earlier output.
+        json_object = json.dumps(out, indent=4)
+        with open("data_cot.json", "w") as outfile:
+            outfile.write(json_object)
+
+# Runs the evaluation over all N problems whenever this module is executed or
+# imported (there is no __main__ guard).
+test_gsm8k(range(N))
diff --git a/experiments/gsm8k/gsm8k_py_expr.py b/experiments/gsm8k/gsm8k_py_expr.py
@@ -0,0 +1,92 @@
+import json
+from tqdm import tqdm
+from io import StringIO
+import sys
+
+import scallopy
+import scallopy_ext
+
+from dataset_loader import GSM8KDataset
+
+# Built at import time; GSM8KDataset opens and parses its dataset file in its constructor.
+TASK = GSM8KDataset()
+# Number of problems in the loaded dataset.
+N = len(TASK)
+# Scallop program imported into every context created by test_py_expr_parsing.
+SCALLOP_FILE = "py_expr_steps.scl"
+# Absolute tolerance used for the "margin_score" comparison.
+MARGIN = 0.001
+
+class Args:
+    """Fixed settings object passed to ``scallopy_ext.config.configure``.
+
+    Every instance carries the same hard-coded values.
+    NOTE(review): presumably mimics a parsed command-line namespace -- confirm
+    against scallopy_ext.
+    """
+
+    def __init__(self):
+        # Device settings: CUDA disabled, no GPU selected.
+        self.cuda, self.gpu = False, None
+        # OpenAI settings: request budget, model name and sampling temperature.
+        self.num_allowed_openai_request = 100
+        self.openai_gpt_model, self.openai_gpt_temperature = "gpt-4", 0
+
+def test_py_expr_parsing(range=range(N)):
+    """Evaluate the ``SCALLOP_FILE`` program on GSM8K problems.
+
+    For every index a fresh ``ScallopContext`` is built, the question is added
+    as a ``question`` fact, the program is run, and each tuple of the
+    ``result`` relation is compared with the reference answer. A problem gets
+    ``exact_score`` 1 when some result equals the answer and ``margin_score``
+    1 when some result lies within ``MARGIN`` of it. Text printed to stdout
+    while the program runs is captured into a per-problem log. The accumulated
+    results are rewritten to ``data_py_expr.json`` after every problem, and
+    the total exact score is printed at the end.
+
+    Args:
+        range: Iterable of indices into ``TASK``; defaults to every problem.
+            The name shadows the builtin and is kept for compatibility.
+    """
+    out = {"exact_score": 0, "margin_score": 0, "data": [], "logs": []}
+
+    for i in tqdm(range):
+        question, answer = TASK[i]
+
+        buffer = StringIO()
+        # Remember whatever stdout the caller had and always put it back, so
+        # an unexpected error cannot leave stdout pointing at the buffer.
+        previous_stdout = sys.stdout
+        sys.stdout = buffer
+        try:
+            ctx = scallopy.ScallopContext(provenance="unit")
+            scallopy_ext.config.configure(Args(), [])
+            scallopy_ext.extlib.load_extlib(ctx)
+            ctx.import_file(SCALLOP_FILE)
+            ctx.add_facts("question", [(question,)])
+            ctx.run()
+            res = list(ctx.relation("result"))
+            exact_score = 0
+            margin_score = 0
+            final_answer = None
+            for output_ans, in res:
+                value = float(output_ans)
+                if value == answer:
+                    exact_score = 1
+                    margin_score = 1
+                    final_answer = output_ans
+                # A near match is only recorded while no exact match is known.
+                if exact_score == 0 and abs(value - answer) <= MARGIN:
+                    margin_score = 1
+                    final_answer = output_ans
+            out["data"].append(
+                {
+                    "id": i,
+                    "question": question,
+                    "correct_answer": answer,
+                    "final_answer": final_answer,
+                    "exact_score": exact_score,
+                    "margin_score": margin_score,
+                    "steps": list(ctx.relation("step")),
+                    "outputted_answers": res,
+                    "num_answers": len(res),
+                }
+            )
+            out["exact_score"] += exact_score
+            out["margin_score"] += margin_score
+        except Exception as e:
+            # Deliberate best effort: a failing problem is logged with its
+            # exception text, scores 0, and the run continues.
+            out["data"].append(
+                {
+                    "id": i,
+                    "question": question,
+                    "exception": str(e),
+                    "exact_score": 0,
+                    "margin_score": 0,
+                }
+            )
+        finally:
+            sys.stdout = previous_stdout
+
+        raw_log = buffer.getvalue()
+        try:
+            # Turn escaped sequences (e.g. a literal backslash-n) in the
+            # captured output into the characters they stand for.
+            log_text = raw_log.encode("utf-8").decode("unicode_escape")
+        except UnicodeDecodeError:
+            # A malformed escape such as a trailing backslash must not abort
+            # the whole run; keep the raw text instead.
+            log_text = raw_log
+        out["logs"].append({"id": i, "log": log_text})
+
+        # Serialize before opening the file so a serialization error cannot
+        # truncate the results written so far.
+        json_object = json.dumps(out, indent=2)
+        with open("data_py_expr.json", "w") as outfile:
+            outfile.write(json_object)
+
+    print(out["exact_score"])
+
+# Run the evaluation over all N problems only when executed as a script.
+if __name__ == "__main__":
+    test_py_expr_parsing(range(N))
diff --git a/experiments/gsm8k/gsm8k_scallop.py b/experiments/gsm8k/gsm8k_scallop.py
@@ -0,0 +1,91 @@
+import json
+from tqdm import tqdm
+from io import StringIO
+import sys
+
+import scallopy
+import scallopy_ext
+
+from dataset_loader import GSM8KDataset
+
+# Built at import time; GSM8KDataset opens and parses its dataset file in its constructor.
+TASK = GSM8KDataset()
+# Number of problems in the loaded dataset.
+N = len(TASK)
+# Scallop program imported into every context created by test_semantic_parser.
+SCALLOP_FILE = "semantic_parser.scl"
+# Absolute tolerance used for the "margin_score" comparison.
+MARGIN = 0.001
+
+class Args:
+    """Fixed settings object passed to ``scallopy_ext.config.configure``.
+
+    Every instance carries the same hard-coded values.
+    NOTE(review): presumably mimics a parsed command-line namespace -- confirm
+    against scallopy_ext.
+    """
+
+    def __init__(self):
+        # Device settings: CUDA disabled, no GPU selected.
+        self.cuda, self.gpu = False, None
+        # OpenAI settings: request budget, model name and sampling temperature.
+        self.num_allowed_openai_request = 100
+        self.openai_gpt_model, self.openai_gpt_temperature = "gpt-4", 0
+
+def test_semantic_parser(range=range(N)):
+    """Evaluate the ``SCALLOP_FILE`` semantic parser on GSM8K problems.
+
+    For every index a fresh ``ScallopContext`` is built, the question is added
+    as a ``question`` fact, the program is run, and each tuple of the
+    ``result`` relation is compared with the reference answer. A problem gets
+    ``exact_score`` 1 when some result equals the answer and ``margin_score``
+    1 when some result lies within ``MARGIN`` of it. Text printed to stdout
+    while the program runs is captured into a per-problem log. The accumulated
+    results are rewritten to ``data_scallop.json`` after every problem, and
+    the total exact score is printed at the end.
+
+    Args:
+        range: Iterable of indices into ``TASK``; defaults to every problem.
+            The name shadows the builtin and is kept for compatibility.
+    """
+    out = {"exact_score": 0, "margin_score": 0, "data": [], "logs": []}
+
+    for i in tqdm(range):
+        question, answer = TASK[i]
+
+        buffer = StringIO()
+        # Remember whatever stdout the caller had and always put it back, so
+        # an unexpected error cannot leave stdout pointing at the buffer.
+        previous_stdout = sys.stdout
+        sys.stdout = buffer
+        try:
+            ctx = scallopy.ScallopContext(provenance="unit")
+            scallopy_ext.config.configure(Args(), [])
+            scallopy_ext.extlib.load_extlib(ctx)
+            ctx.import_file(SCALLOP_FILE)
+            ctx.add_facts("question", [(question,)])
+            ctx.run()
+            res = list(ctx.relation("result"))
+            exact_score = 0
+            margin_score = 0
+            final_answer = None
+            for output_ans, in res:
+                value = float(output_ans)
+                if value == answer:
+                    exact_score = 1
+                    margin_score = 1
+                    final_answer = output_ans
+                # A near match is only recorded while no exact match is known.
+                if exact_score == 0 and abs(value - answer) <= MARGIN:
+                    margin_score = 1
+                    final_answer = output_ans
+            out["data"].append(
+                {
+                    "id": i,
+                    "question": question,
+                    "correct_answer": answer,
+                    "final_answer": final_answer,
+                    "exact_score": exact_score,
+                    "margin_score": margin_score,
+                    "parsed_expr": list(ctx.relation("parsed_expr")),
+                    "outputted_answers": res,
+                    "num_answers": len(res),
+                }
+            )
+            out["exact_score"] += exact_score
+            out["margin_score"] += margin_score
+        except Exception as e:
+            # Deliberate best effort: a failing problem is logged with its
+            # exception text, scores 0, and the run continues.
+            out["data"].append(
+                {
+                    "id": i,
+                    "question": question,
+                    "exception": str(e),
+                    "exact_score": 0,
+                    "margin_score": 0,
+                }
+            )
+        finally:
+            sys.stdout = previous_stdout
+
+        raw_log = buffer.getvalue()
+        try:
+            # Turn escaped sequences (e.g. a literal backslash-n) in the
+            # captured output into the characters they stand for.
+            log_text = raw_log.encode("utf-8").decode("unicode_escape")
+        except UnicodeDecodeError:
+            # A malformed escape such as a trailing backslash must not abort
+            # the whole run; keep the raw text instead.
+            log_text = raw_log
+        out["logs"].append({"id": i, "log": log_text})
+
+        # Serialize before opening the file so a serialization error cannot
+        # truncate the results written so far.
+        json_object = json.dumps(out, indent=2)
+        with open("data_scallop.json", "w") as outfile:
+            outfile.write(json_object)
+
+    print(out["exact_score"])
+
+# Runs the evaluation over every problem (default argument) whenever this
+# module is executed or imported (there is no __main__ guard).
+test_semantic_parser()