From 2f01a6fbf4b520c57e5d73e8da3b93ffa72144af Mon Sep 17 00:00:00 2001
From: Ziyang Li <liby99@icloud.com>
Date: Fri, 23 Feb 2024 09:22:04 -0500
Subject: [PATCH] Adding gsm8k experiments

---
 experiments/gsm8k/.gitignore                |   2 +
 experiments/gsm8k/dataset_loader.py         |  18 ++
 experiments/gsm8k/gsm8k_baseline.py         |  72 ++++++++
 experiments/gsm8k/gsm8k_cot.py              |  72 ++++++++
 experiments/gsm8k/gsm8k_py_expr.py          |  92 ++++++++++
 experiments/gsm8k/gsm8k_scallop.py          |  91 ++++++++++
 experiments/gsm8k/gsm8k_scallop_cot.py      | 141 ++++++++++++++++
 experiments/gsm8k/gsm8k_scallop_steps.py    |  95 +++++++++++
 experiments/gsm8k/py_expr_steps.scl         |  64 +++++++
 experiments/gsm8k/semantic_parser.scl       |  49 ++++++
 experiments/gsm8k/semantic_parser_cot.scl   |  14 ++
 experiments/gsm8k/semantic_parser_steps.scl | 176 ++++++++++++++++++++
 12 files changed, 886 insertions(+)
 create mode 100644 experiments/gsm8k/.gitignore
 create mode 100644 experiments/gsm8k/dataset_loader.py
 create mode 100644 experiments/gsm8k/gsm8k_baseline.py
 create mode 100644 experiments/gsm8k/gsm8k_cot.py
 create mode 100644 experiments/gsm8k/gsm8k_py_expr.py
 create mode 100644 experiments/gsm8k/gsm8k_scallop.py
 create mode 100644 experiments/gsm8k/gsm8k_scallop_cot.py
 create mode 100644 experiments/gsm8k/gsm8k_scallop_steps.py
 create mode 100644 experiments/gsm8k/py_expr_steps.scl
 create mode 100644 experiments/gsm8k/semantic_parser.scl
 create mode 100644 experiments/gsm8k/semantic_parser_cot.scl
 create mode 100644 experiments/gsm8k/semantic_parser_steps.scl

diff --git a/experiments/gsm8k/.gitignore b/experiments/gsm8k/.gitignore
new file mode 100644
index 0000000..c189074
--- /dev/null
+++ b/experiments/gsm8k/.gitignore
@@ -0,0 +1,2 @@
+*.json
+extra*.py
\ No newline at end of file
diff --git a/experiments/gsm8k/dataset_loader.py b/experiments/gsm8k/dataset_loader.py
new file mode 100644
index 0000000..b28be45
--- /dev/null
+++ b/experiments/gsm8k/dataset_loader.py
@@ -0,0 +1,18 @@
+import os
+import json
+
+class GSM8KDataset:
+  """GSM8K test split; indexing yields a (question, numeric answer) pair."""
+  def __init__(self):
+    self.dataset_loc = "../../../gsm8k/test.jsonl"  # relative to the working directory
+    # JSONL: one object per line; blank lines are skipped rather than fed to json.loads.
+    with open(self.dataset_loc, encoding="utf-8") as dataset:
+      self.data = [json.loads(obj) for obj in dataset if obj.strip()]
+
+  def __len__(self):
+    return len(self.data)
+
+  def __getitem__(self, i):
+    question = self.data[i]["question"]
+    answer = float(self.data[i]["answer"].split()[-1].replace(',', '')) # The answer is always given at the end as the last word
+    return (question, answer)
diff --git a/experiments/gsm8k/gsm8k_baseline.py b/experiments/gsm8k/gsm8k_baseline.py
new file mode 100644
index 0000000..ab8bcf8
--- /dev/null
+++ b/experiments/gsm8k/gsm8k_baseline.py
@@ -0,0 +1,72 @@
+import openai
+import json
+from tqdm import tqdm
+from io import StringIO
+import sys
+
+from dataset_loader import GSM8KDataset
+
+TASK = GSM8KDataset()
+N = len(TASK)
+MARGIN = 0.001
+
+HEADER = '''
+In this task, you will be given a math word problem to solve.
+Please output your reasoning as a chain of thought and then output your answer on a new line at the end. For the final answer, do not include any non-numerical digits except for a decimal point when applicable.
+
+Here are some examples:
+
+Question: Lisa, Jack, and Tommy earned $60 from washing cars all week. However, half of the $60 was earned by Lisa. Tommy earned half of what Lisa earned. How much more money did Lisa earn than Tommy?
+Answer: Lisa earned $60 * 1/2 = $30.\nTommy earned $30 * 1/2 = $15.\nLisa earned $30 - $15 = $15 more than Tommy.
+
+15
+
+Now, answer the following question:
+
+
+
+Question: '''  # the question text is concatenated directly after this prefix
+
+def run_gpt(question):
+    """Send HEADER + question as one user message to GPT-4; return the reply text."""
+    prompt = HEADER + question + "\nAnswer: "
+    reply = openai.ChatCompletion.create(
+        model="gpt-4", temperature=0,
+        messages=[{"role": "user", "content": prompt}],
+    )
+    return reply["choices"][0]["message"]["content"]
+
+def test_gsm8k(range):
+    """Ask GPT-4 each TASK[i] for i in `range`, grade the reply's last line, and save running results as JSON."""
+    out = {"exact_score": 0, "margin_score": 0, "data": []}
+    for i in tqdm(range):
+        question, ans = TASK[i]
+        # Any failure (API error, non-numeric final line) is logged as a zero-score entry.
+        try:
+            output_ans = run_gpt(question)
+            final_ans = output_ans.strip().split('\n')[-1].replace(',', '')  # strip(): trailing blank lines must not hide the answer
+            exact_score = int(float(final_ans) == float(ans))
+            margin_score = int(abs(float(final_ans) - float(ans)) <= MARGIN)
+            out["exact_score"] += exact_score
+            out["margin_score"] += margin_score
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": question,
+                    "reasoning": output_ans,
+                    "answer": final_ans,
+                    "correct_answer": ans,
+                    "exact_score": exact_score,
+                    "margin_score": margin_score,
+                }
+            ]
+        except Exception as e:
+            out["data"] += [
+                {"id": i, "question": question, "exception": str(e), "exact_score": 0, "margin_score": 0}
+            ]
+        # The whole results file is rewritten after every question.
+        json_object = json.dumps(out, indent=4)
+        with open("data_baseline.json", "w") as outfile:
+            outfile.write(json_object)
+
+test_gsm8k(range(N))
\ No newline at end of file
diff --git a/experiments/gsm8k/gsm8k_cot.py b/experiments/gsm8k/gsm8k_cot.py
new file mode 100644
index 0000000..6c2e995
--- /dev/null
+++ b/experiments/gsm8k/gsm8k_cot.py
@@ -0,0 +1,72 @@
+import openai
+import json
+from tqdm import tqdm
+from io import StringIO
+import sys
+
+from dataset_loader import GSM8KDataset
+
+TASK = GSM8KDataset()
+N = len(TASK)
+MARGIN = 0.001
+
+HEADER = '''
+In this task, you will be given a math word problem to solve.
+Please output your reasoning as a chain of thought and then output your answer on a new line at the end. For the final answer, do not include any non-numerical digits except for a decimal point when applicable.
+
+Here are some examples:
+
+Question: Lisa, Jack, and Tommy earned $60 from washing cars all week. However, half of the $60 was earned by Lisa. Tommy earned half of what Lisa earned. How much more money did Lisa earn than Tommy?
+Answer: Let's think step by step. First, we know that Lisa earned half of the $60, which is $60 / 2 = $30. Then, we know that Tommy earned half of what Lisa earned, which is $30 / 2 = $15. So, Lisa earned $30 - $15 = $15 more than Tommy.
+
+15
+
+Now, answer the following question:
+
+
+
+Question: '''  # the question text is concatenated directly after this prefix
+
+def run_gpt(question):
+    """Send HEADER + question with a chain-of-thought cue to GPT-4; return the reply text."""
+    prompt = HEADER + question + "\nAnswer: Let's think step by step. "
+    reply = openai.ChatCompletion.create(
+        model="gpt-4", temperature=0,
+        messages=[{"role": "user", "content": prompt}],
+    )
+    return reply["choices"][0]["message"]["content"]
+
+def test_gsm8k(range):
+    """Ask GPT-4 (chain-of-thought prompt) each TASK[i] for i in `range`, grade the reply's last line, and save running results as JSON."""
+    out = {"exact_score": 0, "margin_score": 0, "data": []}
+    for i in tqdm(range):
+        question, ans = TASK[i]
+        # Any failure (API error, non-numeric final line) is logged as a zero-score entry.
+        try:
+            output_ans = run_gpt(question)
+            final_ans = output_ans.strip().split('\n')[-1].replace(',', '')  # strip(): trailing blank lines must not hide the answer
+            exact_score = int(float(final_ans) == float(ans))
+            margin_score = int(abs(float(final_ans) - float(ans)) <= MARGIN)
+            out["exact_score"] += exact_score
+            out["margin_score"] += margin_score
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": question,
+                    "reasoning": output_ans,
+                    "answer": final_ans,
+                    "correct_answer": ans,
+                    "exact_score": exact_score,
+                    "margin_score": margin_score,
+                }
+            ]
+        except Exception as e:
+            out["data"] += [
+                {"id": i, "question": question, "exception": str(e), "exact_score": 0, "margin_score": 0}
+            ]
+        # Own output file: writing to data_baseline.json would overwrite the baseline run's results.
+        json_object = json.dumps(out, indent=4)
+        with open("data_cot.json", "w") as outfile:
+            outfile.write(json_object)
+
+test_gsm8k(range(N))
\ No newline at end of file
diff --git a/experiments/gsm8k/gsm8k_py_expr.py b/experiments/gsm8k/gsm8k_py_expr.py
new file mode 100644
index 0000000..5f7156a
--- /dev/null
+++ b/experiments/gsm8k/gsm8k_py_expr.py
@@ -0,0 +1,92 @@
+import json
+from tqdm import tqdm
+from io import StringIO
+import sys
+
+import scallopy
+import scallopy_ext
+
+from dataset_loader import GSM8KDataset
+
+TASK = GSM8KDataset()
+N = len(TASK)
+SCALLOP_FILE = "py_expr_steps.scl"
+MARGIN = 0.001
+
+class Args:  # settings object handed to scallopy_ext.config.configure(Args(), []) below
+    def __init__(self):
+        self.cuda = False
+        self.gpu = None
+        self.num_allowed_openai_request = 100  # NOTE(review): presumably a request cap enforced by scallopy_ext; confirm whether it applies per configure() call or per run
+        self.openai_gpt_model = "gpt-4"
+        self.openai_gpt_temperature = 0
+
+def test_py_expr_parsing(range=range(N)):
+    """Run SCALLOP_FILE on TASK[i] for every i in `range`, score the `result` relation against the gold answer, and save running results as JSON."""
+    out = {"exact_score": 0, "margin_score": 0, "data": [], "logs": []}
+    for i in tqdm(range):
+        question, answer = TASK[i]
+        # Capture anything printed while the Scallop program runs; it is stored under "logs".
+        buffer = StringIO()
+        sys.stdout = buffer
+        try:
+            ctx = scallopy.ScallopContext(provenance="unit")
+            scallopy_ext.config.configure(Args(), [])
+            scallopy_ext.extlib.load_extlib(ctx)
+            ctx.import_file(SCALLOP_FILE)
+            ctx.add_facts("question", [(question,)])
+            ctx.run()
+            res = list(ctx.relation("result"))
+            exact_score = 0
+            margin_score = 0
+            final_answer = None
+            for output_ans, in res:
+                if float(output_ans) == answer:
+                    exact_score = 1
+                    margin_score = 1
+                    final_answer = output_ans
+                if exact_score == 0 and abs(float(output_ans) - answer) <= MARGIN:
+                    margin_score = 1
+                    final_answer = output_ans
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": question,
+                    "correct_answer": answer,
+                    "final_answer": final_answer,
+                    "exact_score": exact_score,
+                    "margin_score": margin_score,
+                    "steps": list(ctx.relation("step")),
+                    "outputted_answers": res,
+                    "num_answers": len(res),
+                }
+            ]
+            out["exact_score"] += exact_score
+            out["margin_score"] += margin_score
+        except Exception as e:
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": question,
+                    "exception": str(e),
+                    "exact_score": 0,
+                    "margin_score": 0,
+                }
+            ]
+        sys.stdout = sys.__stdout__  # restore now, so a failure in the post-processing below is not left with stdout redirected
+        out["logs"] += [
+            {
+                "id": i,
+                "log": buffer.getvalue().encode("latin-1", "backslashreplace").decode("unicode_escape"),  # unescape \n etc. without mangling non-ASCII text
+            }
+        ]
+
+        json_object = json.dumps(out, indent=2)
+        with open("data_py_expr.json", "w") as outfile:
+            outfile.write(json_object)
+
+    sys.stdout = sys.__stdout__
+    print(out["exact_score"])
+
+if __name__ == "__main__":
+    test_py_expr_parsing(range(N))
diff --git a/experiments/gsm8k/gsm8k_scallop.py b/experiments/gsm8k/gsm8k_scallop.py
new file mode 100644
index 0000000..6f2e418
--- /dev/null
+++ b/experiments/gsm8k/gsm8k_scallop.py
@@ -0,0 +1,91 @@
+import json
+from tqdm import tqdm
+from io import StringIO
+import sys
+
+import scallopy
+import scallopy_ext
+
+from dataset_loader import GSM8KDataset
+
+TASK = GSM8KDataset()
+N = len(TASK)
+SCALLOP_FILE = "semantic_parser.scl"
+MARGIN = 0.001
+
+class Args:  # settings object handed to scallopy_ext.config.configure(Args(), []) below
+    def __init__(self):
+        self.cuda = False
+        self.gpu = None
+        self.num_allowed_openai_request = 100  # NOTE(review): presumably a request cap enforced by scallopy_ext; confirm whether it applies per configure() call or per run
+        self.openai_gpt_model = "gpt-4"
+        self.openai_gpt_temperature = 0
+
+def test_semantic_parser(range=range(N)):
+    """Run SCALLOP_FILE on TASK[i] for every i in `range`, score the `result` relation against the gold answer, and save running results as JSON."""
+    out = {"exact_score": 0, "margin_score": 0, "data": [], "logs": []}
+    for i in tqdm(range):
+        question, answer = TASK[i]
+        # Capture anything printed while the Scallop program runs; it is stored under "logs".
+        buffer = StringIO()
+        sys.stdout = buffer
+        try:
+            ctx = scallopy.ScallopContext(provenance="unit")
+            scallopy_ext.config.configure(Args(), [])
+            scallopy_ext.extlib.load_extlib(ctx)
+            ctx.import_file(SCALLOP_FILE)
+            ctx.add_facts("question", [(question,)])
+            ctx.run()
+            res = list(ctx.relation("result"))
+            exact_score = 0
+            margin_score = 0
+            final_answer = None
+            for output_ans, in res:
+                if float(output_ans) == answer:
+                    exact_score = 1
+                    margin_score = 1
+                    final_answer = output_ans
+                if exact_score == 0 and abs(float(output_ans) - answer) <= MARGIN:
+                    margin_score = 1
+                    final_answer = output_ans
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": question,
+                    "correct_answer": answer,
+                    "final_answer": final_answer,
+                    "exact_score": exact_score,
+                    "margin_score": margin_score,
+                    "parsed_expr": list(ctx.relation("parsed_expr")),
+                    "outputted_answers": res,
+                    "num_answers": len(res),
+                }
+            ]
+            out["exact_score"] += exact_score
+            out["margin_score"] += margin_score
+        except Exception as e:
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": question,
+                    "exception": str(e),
+                    "exact_score": 0,
+                    "margin_score": 0,
+                }
+            ]
+        sys.stdout = sys.__stdout__  # restore now, so a failure in the post-processing below is not left with stdout redirected
+        out["logs"] += [
+            {
+                "id": i,
+                "log": buffer.getvalue().encode("latin-1", "backslashreplace").decode("unicode_escape"),  # unescape \n etc. without mangling non-ASCII text
+            }
+        ]
+
+        json_object = json.dumps(out, indent=2)
+        with open("data_scallop.json", "w") as outfile:
+            outfile.write(json_object)
+
+    sys.stdout = sys.__stdout__
+    print(out["exact_score"])
+          
+test_semantic_parser()
\ No newline at end of file
diff --git a/experiments/gsm8k/gsm8k_scallop_cot.py b/experiments/gsm8k/gsm8k_scallop_cot.py
new file mode 100644
index 0000000..72b77c9
--- /dev/null
+++ b/experiments/gsm8k/gsm8k_scallop_cot.py
@@ -0,0 +1,141 @@
+import json
+from tqdm import tqdm
+from io import StringIO
+import sys
+
+import scallopy
+import scallopy_ext
+
+import openai
+
+from dataset_loader import GSM8KDataset
+
+TASK = GSM8KDataset()
+N = len(TASK)
+SCALLOP_FILE = "semantic_parser_cot.scl"
+MARGIN = 0.001
+
+HEADER = """
+Suppose we have the following symbolic expression language:
+
+Expr ::= Const(float) | Add(Expr, Expr) | Sub(Expr, Expr) | Mult(Expr, Expr) | Div(Expr, Expr)
+
+In the following task, you will be given a math word problem. Please semantically parse it into a symbolic expression.
+First, output your reasoning, then put the symbolic expression on a new line.
+
+Here are a few examples:
+
+Question: Lisa, Jack, and Tommy earned $60 from washing cars all week. However, half of the $60 was earned by Lisa. Tommy earned half of what Lisa earned. How much more money did Lisa earn than Tommy?
+Answer: Let's think step by step. First, we know that Lisa earned half of the $60, which corresponds to Mult(Const(0.5), Const(60)). Then, we know that Tommy earned half of what Lisa earned (which was Mult(Const(0.5), Const(60))), which corresponds to Mult(Const(0.5), Mult(Const(0.5), Const(60))). \
+Finally, to see how much more money Lisa earned than Tommy, we must subtract the amount Tommy earned, Mult(Const(0.5), Mult(Const(0.5), Const(60))), from the amount that Lisa earned, Mult(Const(0.5), Const(60)). This gives us a final answer of \
+Sub(Mult(Const(0.5), Const(60)), Mult(Const(0.5), Mult(Const(0.5), Const(60)))).
+
+Sub(Mult(Const(0.5), Const(60)), Mult(Const(0.5), Mult(Const(0.5), Const(60))))
+
+Question: Colton had 72 dolphin stickers. He gave 4 stickers each to 3 friends.  He also gave his friend Mandy 2 more than he gave his three friends total.   And he gave Justin 10 less than Mandy.  How many stickers does Colton have left?
+Answer: Let's think step by step. To find the number of stickers Colton has left, we must subtract the number of stickers he gave away to all of his friends from 72 or Const(72), which is the number of stickers he started with. We will first add up \
+the total number of stickers he gave away.
+
+First, he gave 4 stickers each to the first 3 friends, which gives a total of Mult(Const(4), Const(3)) stickers. Then, he gave Mandy 2 more than he gave his three friends total. Since he gave his three friends \
+Mult(Const(4), Const(3)) total, this means that he gave Mandy Add(Mult(Const(4), Const(3)), Const(2)) \
+stickers. So far, Colton has given away Add(Mult(Const(4), Const(3)), Add(Mult(Const(4), Const(3)), Const(2))) \
+stickers. Next, Colton gave Justin 10 fewer stickers than Mandy. Since he gave Mandy Add(Mult(Const(4), Const(3)), Const(2)) \
+stickers, this means that he gave Justin Sub(Add(Mult(Const(4), Const(3)), Const(2)), Const(10)) stickers. \
+Since we must also add the stickers Colton gave Justin to the total number of stickers Colton gave away, \
+Colton has given away Add(Add(Mult(Const(4), Const(3)), Add(Mult(Const(4), Const(3)), Const(2))), Sub(Add(Mult(Const(4), Const(3)), Const(2)), Const(10))) \
+stickers in total.
+
+Finally, since Colton started out with Const(72) stickers but gave away Add(Add(Mult(Const(4), Const(3)), Add(Mult(Const(4), Const(3)), Const(2))), Sub(Add(Mult(Const(4), Const(3)), Const(2)), Const(10))) \
+of them, Colton has Sub(Const(72), Add(Add(Mult(Const(4), Const(3)), Add(Mult(Const(4), Const(3)), Const(2))), Sub(Add(Mult(Const(4), Const(3)), Const(2)), Const(10)))) \
+stickers remaining.
+
+Sub(Const(72), Add(Add(Mult(Const(4), Const(3)), Add(Mult(Const(4), Const(3)), Const(2))), Sub(Add(Mult(Const(4), Const(3)), Const(2)), Const(10))))
+
+Now, please semantically parse the following question:
+
+Question: 
+"""
+
+class Args:  # settings object handed to scallopy_ext.config.configure(Args(), []) below
+    def __init__(self):
+        self.cuda = False
+        self.gpu = None
+        self.num_allowed_openai_request = 100  # NOTE(review): presumably a request cap enforced by scallopy_ext; confirm whether it applies per configure() call or per run
+        self.openai_gpt_model = "gpt-4"
+        self.openai_gpt_temperature = 0
+
+def run_gpt(question):
+    """Send HEADER + question with a chain-of-thought cue to GPT-4; return the reply text."""
+    prompt = HEADER + question + "\nAnswer: Let's think step by step. "
+    reply = openai.ChatCompletion.create(
+        model="gpt-4", temperature=0,
+        messages=[{"role": "user", "content": prompt}],
+    )
+    return reply["choices"][0]["message"]["content"]
+
+def test_semantic_parser_cot(range=range(N)):
+    """For every i in `range`: have GPT-4 parse TASK[i] into an Expr, evaluate it with SCALLOP_FILE, score it, and save running results as JSON."""
+    out = {"exact_score": 0, "margin_score": 0, "data": [], "logs": []}
+    for i in tqdm(range):
+        question, answer = TASK[i]
+        # Capture anything printed while the Scallop program runs; it is stored under "logs".
+        buffer = StringIO()
+        sys.stdout = buffer
+        try:
+            # GPT should put the parsed expression on the last line
+            output = run_gpt(question)
+            parsed_expr = output.strip().split('\n')[-1]  # strip(): a trailing newline must not yield an empty expression
+
+            ctx = scallopy.ScallopContext(provenance="unit")
+            scallopy_ext.config.configure(Args(), [])
+            scallopy_ext.extlib.load_extlib(ctx)
+            ctx.import_file(SCALLOP_FILE)
+            ctx.add_facts("parsed_expr", [(parsed_expr,)])
+            ctx.run()
+            results = list(ctx.relation("result"))
+            final_ans, = results[0]  # IndexError when nothing was derived; recorded by the except below
+            exact_score = int(final_ans == answer)
+            margin_score = int(abs(final_ans - answer) <= MARGIN)
+
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": question,
+                    "correct_answer": answer,
+                    "final_answer": final_ans,
+                    "exact_score": exact_score,
+                    "margin_score": margin_score,
+                    "reasoning": output,
+                    "parsed_expr": parsed_expr,
+                    "outputted_answers": results,
+                }
+            ]
+            out["exact_score"] += exact_score
+            out["margin_score"] += margin_score
+
+        except Exception as e:
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": question,
+                    "exception": str(e),
+                    "exact_score": 0,
+                    "margin_score": 0,
+                }
+            ]
+        sys.stdout = sys.__stdout__  # restore now, so a failure in the post-processing below is not left with stdout redirected
+        out["logs"] += [
+            {
+                "id": i,
+                "log": buffer.getvalue().encode("latin-1", "backslashreplace").decode("unicode_escape"),  # unescape \n etc. without mangling non-ASCII text
+            }
+        ]
+
+        json_object = json.dumps(out, indent=2)
+        with open("data_scallop_cot.json", "w") as outfile:
+            outfile.write(json_object)
+
+    sys.stdout = sys.__stdout__
+    print(out["exact_score"])
+
+test_semantic_parser_cot(range(N))
\ No newline at end of file
diff --git a/experiments/gsm8k/gsm8k_scallop_steps.py b/experiments/gsm8k/gsm8k_scallop_steps.py
new file mode 100644
index 0000000..dba88ce
--- /dev/null
+++ b/experiments/gsm8k/gsm8k_scallop_steps.py
@@ -0,0 +1,95 @@
+import json
+from tqdm import tqdm
+from io import StringIO
+import sys
+import time
+
+import scallopy
+import scallopy_ext
+
+from dataset_loader import GSM8KDataset
+
+TASK = GSM8KDataset()
+N = len(TASK)
+SCALLOP_FILE = "semantic_parser_steps.scl"
+MARGIN = 0.001
+
+class Args:  # settings object handed to scallopy_ext.config.configure(Args(), []) below
+    def __init__(self):
+        self.cuda = False
+        self.gpu = None
+        self.num_allowed_openai_request = 100  # NOTE(review): presumably a request cap enforced by scallopy_ext; confirm whether it applies per configure() call or per run
+        self.openai_gpt_model = "gpt-4"
+        self.openai_gpt_temperature = 0
+
+def test_semantic_parser(range=range(N)):
+    """Run SCALLOP_FILE on TASK[i] for every i in `range`, score the `result` relation against the gold answer, and save running results as JSON."""
+    out = {"exact_score": 0, "margin_score": 0, "data": [], "logs": []}
+    for i in tqdm(range):
+        time.sleep(10)  # NOTE(review): presumably throttles API requests; confirm the delay is still needed
+        question, answer = TASK[i]
+        # Capture anything printed while the Scallop program runs; it is stored under "logs".
+        buffer = StringIO()
+        sys.stdout = buffer
+        try:
+            ctx = scallopy.ScallopContext(provenance="unit")
+            scallopy_ext.config.configure(Args(), [])
+            scallopy_ext.extlib.load_extlib(ctx)
+            ctx.import_file(SCALLOP_FILE)
+            ctx.add_facts("question", [(question,)])
+            ctx.run()
+            res = list(ctx.relation("result"))
+            exact_score = 0
+            margin_score = 0
+            final_answer = None
+            for output_ans, in res:
+                if float(output_ans) == answer:
+                    exact_score = 1
+                    margin_score = 1
+                    final_answer = output_ans
+                if exact_score == 0 and abs(float(output_ans) - answer) <= MARGIN:
+                    margin_score = 1
+                    final_answer = output_ans
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": question,
+                    "correct_answer": answer,
+                    "final_answer": final_answer,
+                    "exact_score": exact_score,
+                    "margin_score": margin_score,
+                    "steps": list(ctx.relation("step_as_string")),
+                    "goal": list(ctx.relation("goal")),
+                    "vars_evaluated": list(ctx.relation("var_result")),
+                    "outputted_answers": res,
+                    "num_answers": len(res),
+                }
+            ]
+            out["exact_score"] += exact_score
+            out["margin_score"] += margin_score
+        except Exception as e:
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": question,
+                    "exception": str(e),
+                    "exact_score": 0,
+                    "margin_score": 0,
+                }
+            ]
+        sys.stdout = sys.__stdout__  # restore now, so a failure in the post-processing below is not left with stdout redirected
+        out["logs"] += [
+            {
+                "id": i,
+                "log": buffer.getvalue().encode("latin-1", "backslashreplace").decode("unicode_escape"),  # unescape \n etc. without mangling non-ASCII text
+            }
+        ]
+
+        json_object = json.dumps(out, indent=2)
+        with open("data_scallop_steps.json", "w") as outfile:
+            outfile.write(json_object)
+
+    sys.stdout = sys.__stdout__
+    print(out["exact_score"])
+          
+test_semantic_parser(range(N))
diff --git a/experiments/gsm8k/py_expr_steps.scl b/experiments/gsm8k/py_expr_steps.scl
new file mode 100644
index 0000000..459581f
--- /dev/null
+++ b/experiments/gsm8k/py_expr_steps.scl
@@ -0,0 +1,64 @@
+@gpt_extract_info(
+  header="Please extract the computation steps needed from the provided question. Make sure to output them in the order in which they should be computed and label them in order, making sure that the last step, when evaluated, will give the answer to the question.",
+  prompts=[
+    "let us first extract the steps as a JSON list. Do not provide the final goal",
+    // "what is the final goal represented as a JSON dict?"
+  ],
+  examples=[
+    (
+      ["Tom can catch 70 fish per week. How many days does he need to catch 100 fish?"],
+      [
+        [(0, "speed", "70 / 7"), (1, "time_needed", "100 / {speed}")],
+      ]
+    ),
+    (
+      ["Chris is deciding whether to sell apples or oranges. He can obtain either 100 apple trees that each make 10 apples a year or 3 orange trees that each make 30 oranges a year. Each apple can be sold for $0.10, while each orange can be sold for $1. Considering that Chris can either sell only apples or only oranges, what is the maximum revenue that he can make per year?"],
+      [
+        [
+          (0, "apples_per_year", "100 * 10"),
+          (1, "oranges_per_year", "3 * 30"),
+          (2, "apple_revenue", "{apples_per_year} * 0.10"),
+          (3, "orange_revenue", "{oranges_per_year} * 1"),
+          (4, "maximum_revenue", "max({apple_revenue}, {orange_revenue})")
+        ],
+      ]
+    )
+  ],
+  model="gpt-4",
+  cot=[true],
+)
+type extract_steps(bound question: String, step_id: i32, var: String, expr: String) // , extract_goal(bound question: String, var: String)
+
+type question(q: String)
+// step: the (index, variable name, expression string) triples extracted for the input question.
+rel step(step_id, var, expr) = question(q) and extract_steps(q, step_id, var, expr)
+// Foreign function bound via @py_eval. NOTE(review): presumably evaluates the string as a Python expression; confirm in scallopy_ext.
+@py_eval type $py_eval_number(s: String) -> f32
+// Evaluation environment: a linked list of (variable, value) bindings, most recent binding first.
+type Context = Cons(String, f32, Context) | Nil()
+
+const EMPTY_CTX = Nil()
+// The variable assigned by the highest-numbered step is the one whose value is returned.
+rel last_step(n) = n := max(n: step(n, _, _))
+rel return_variable(v) = last_step(n) and step(n, v, _)
+// find_value: the value bound to `var` in `ctx`, taking the first match from the head of the list.
+type find_value(bound ctx: Context, bound var: String, n: f32)
+rel find_value(ctx, v, n) = case ctx is Cons(v, n, _)
+rel find_value(ctx, v, n) = case ctx is Cons(vp, _, cont) and vp != v and find_value(cont, v, n)
+// process_eval_string: for each binding in `ctx`, replace the text built by $format("{{}}", v) (presumably "{v}", as in the prompt examples) with the bound value.
+type process_eval_string(bound ctx: Context, bound eval_str: String, output_str: String)
+rel process_eval_string(ctx, e, e) = case ctx is Nil()
+rel process_eval_string(ctx, e1, $string_replace(e2, $format("{{}}", v), n as String)) = case ctx is Cons(v, n, cont) and process_eval_string(cont, e1, e2)
+// eval: substitute the context into the expression string, then pass the result to $py_eval_number.
+type eval(bound s: String, bound ctx: Context, n: f32)
+rel eval(s, ctx, $py_eval_number(eval_str)) = process_eval_string(ctx, s, eval_str)
+// eval_step(id, ctx): the context obtained after evaluating steps 0..id in order.
+type eval_step(id: i32, ctx: Context)
+rel eval_step(0, new Cons(v, n, EMPTY_CTX)) = step(0, v, e) and eval(e, EMPTY_CTX, n)
+rel eval_step(prev_id + 1, new Cons(v, n, prev_ctx)) = step(prev_id + 1, v, e) and eval_step(prev_id, prev_ctx) and eval(e, prev_ctx, n)
+// result: the return variable's value in the context produced by the last step.
+rel final_step_ctx(ctx) = last_step(id) and eval_step(id, ctx)
+rel result(n) = final_step_ctx(ctx) and return_variable(v) and find_value(ctx, v, n)
+
+query step
+query result
diff --git a/experiments/gsm8k/semantic_parser.scl b/experiments/gsm8k/semantic_parser.scl
new file mode 100644
index 0000000..e4222a1
--- /dev/null
+++ b/experiments/gsm8k/semantic_parser.scl
@@ -0,0 +1,49 @@
+type Expr = Const(f32) | Add(Expr, Expr) | Sub(Expr, Expr) | Mult(Expr, Expr) | Div(Expr, Expr)
+// eval(e, v): v is the value of arithmetic expression e, computed recursively over its structure.
+type eval(bound e: Expr, v: f32)
+rel eval(e, v) = case e is Const(v)
+rel eval(e, v1 + v2) = case e is Add(e1, e2) and eval(e1, v1) and eval(e2, v2)
+rel eval(e, v1 - v2) = case e is Sub(e1, e2) and eval(e1, v1) and eval(e2, v2)
+rel eval(e, v1 * v2) = case e is Mult(e1, e2) and eval(e1, v1) and eval(e2, v2)
+rel eval(e, v1 / v2) = case e is Div(e1, e2) and eval(e1, v1) and eval(e2, v2)
+
+@gpt(
+    prompt="""
+Suppose we have the following symbolic expression language:
+
+Expr ::= Const(float) | Add(Expr, Expr) | Sub(Expr, Expr) | Mult(Expr, Expr) | Div(Expr, Expr)
+
+Please semantically parse the following question into a symbolic expression:
+
+Question: {{x}}
+Symbolic Program: {{y}}
+""",
+    examples=[
+        (
+            "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?",
+            "Add(Const(48), Mult(Const(0.5), Const(48)))"
+        ),
+        // (
+        //     "Lisa, Jack, and Tommy earned $60 from washing cars all week. However, half of the $60 was earned by Lisa. Tommy earned half of what Lisa earned. How much more money did Lisa earn than Tommy?",
+        //     "Sub(Mult(Const(0.5), Const(60)), Mult(Const(0.5), Mult(Const(0.5), Const(60))))"
+        // ),
+        // (
+        //     "Arnel had ten boxes of pencils with the same number of pencils in each box.  He kept ten pencils and shared the remaining pencils equally with his five friends. If his friends got eight pencils each, how many pencils are in each box?",
+        //     "Div(Add(Const(10), Mult(Const(5), Const(8))), Const(10))"
+        // ),
+        // (
+        //     "Colton had 72 dolphin stickers. He gave 4 stickers each to 3 friends.  He also gave his friend Mandy 2 more than he gave his three friends total.   And he gave Justin 10 less than Mandy.  How many stickers does Colton have left?",
+        //     "Sub(Sub(Sub(Const(72), Mult(Const(4), Const(3))), Add(Const(2), Mult(Const(4), Const(3)))), Sub(Add(Const(2), Mult(Const(4), Const(3))), Const(10)))"
+        // ),
+    ],
+    model="gpt-4",
+    debug=true,
+) 
+type semantic_parser(bound x: String, y: Entity)
+
+type question(ctx: String)
+// parsed_expr: what semantic_parser returns for each question; result: its value under eval.
+rel parsed_expr(s) = question(q) and semantic_parser(q, s)
+rel result(v) = parsed_expr(e) and eval(e, v)
+
+query result
\ No newline at end of file
diff --git a/experiments/gsm8k/semantic_parser_cot.scl b/experiments/gsm8k/semantic_parser_cot.scl
new file mode 100644
index 0000000..7e19e0e
--- /dev/null
+++ b/experiments/gsm8k/semantic_parser_cot.scl
@@ -0,0 +1,14 @@
+type Expr = Const(f32) | Add(Expr, Expr) | Sub(Expr, Expr) | Mult(Expr, Expr) | Div(Expr, Expr)
+// eval(e, v): v is the value of arithmetic expression e, computed recursively over its structure.
+type eval(bound e: Expr, v: f32)
+rel eval(e, v) = case e is Const(v)
+rel eval(e, v1 + v2) = case e is Add(e1, e2) and eval(e1, v1) and eval(e2, v2)
+rel eval(e, v1 - v2) = case e is Sub(e1, e2) and eval(e1, v1) and eval(e2, v2)
+rel eval(e, v1 * v2) = case e is Mult(e1, e2) and eval(e1, v1) and eval(e2, v2)
+rel eval(e, v1 / v2) = case e is Div(e1, e2) and eval(e1, v1) and eval(e2, v2)
+// Input relation: gsm8k_scallop_cot.py adds the expression taken from GPT's reply as a fact here.
+type parsed_expr(e: Expr)
+// result: the evaluated value of each parsed expression.
+rel result(v) = parsed_expr(e) and eval(e, v)
+
+query result
\ No newline at end of file
diff --git a/experiments/gsm8k/semantic_parser_steps.scl b/experiments/gsm8k/semantic_parser_steps.scl
new file mode 100644
index 0000000..85b2a24
--- /dev/null
+++ b/experiments/gsm8k/semantic_parser_steps.scl
@@ -0,0 +1,176 @@
+@gpt_extract_info(
+header="""
+Please extract step-by-step program.
+
+type Expr = Const(f32) | Var(String) | Add(Expr, Expr) | Sub(Expr, Expr) | Mul(Expr, Expr) | Div(Expr, Expr) | Abs(Expr) | Max(Expr, Expr) | Min(Expr, Expr)
+
+For example, to solve the question
+
+Tom can catch 70 fish per week. How many days does he need to catch 100 fish?
+
+We need the following procedure
+
+speed = 70 / 7
+time_needed = 100 / speed
+
+The above procedure should be expressed in JSON as the following
+
+[
+{"var": "speed", "expr": "Div(Const(70), Const(7))"},
+{"var": "time_needed", "expr": "Div(Const(100), Var('speed'))"}
+]
+
+And the final goal is
+
+{"var": "time_needed"}
+
+Here is another example: given the following question
+
+Jesse and Mia are competing in a week long race. They have one week to run 30 miles. On the first three days Jesse averages (2/3) of a mile. On day four she runs 10 miles. Mia averages 3 miles a day over the first 4 days. What is the average of their average that they have to run over the final three days?
+
+We need the following procedure:
+
+jesse_first_three_days = 2/3 * 3
+jesse_first_four_days = jesse_first_three_days + 10
+mia_first_four_days = 3 * 4
+remaining_days = 7 - 4
+jesse_remaining_distance = 30 - jesse_first_four_days
+mia_remaining_distance = 30 - mia_first_four_days
+jesse_remaining_average = jesse_remaining_distance / remaining_days
+mia_remaining_average = mia_remaining_distance / remaining_days
+average_of_average = (jesse_remaining_average + mia_remaining_average) / 2
+
+The above procedure should be expressed in JSON as the following
+
+[
+{"var": "jesse_first_three_days", "expr": "Mul(Div(Const(2), Const(3)), Const(3))"},
+{"var": "jesse_first_four_days", "expr": "Add(Var('jesse_first_three_days'), Const(10))"},
+{"var": "mia_first_four_days", "expr": "Mul(Const(3), Const(4))"},
+{"var": "remaining_days", "expr": "Sub(Const(7), Const(4))"},
+{"var": "jesse_remaining_distance", "expr": "Sub(Const(30), Var('jesse_first_four_days'))"},
+{"var": "mia_remaining_distance", "expr": "Sub(Const(30), Var('mia_first_four_days'))"},
+{"var": "jesse_remaining_average", "expr": "Div(Var('jesse_remaining_distance'), Var('remaining_days'))"},
+{"var": "mia_remaining_average", "expr": "Div(Var('mia_remaining_distance'), Var('remaining_days'))"},
+{"var": "average_of_average", "expr": "Div(Add(Var('jesse_remaining_average'), Var('mia_remaining_average')), Const(2))"}
+]
+
+And the final goal is
+
+{"var": "average_of_average"}
+
+
+Here is one more example: given the following question
+
+A man is trying to maximize the amount of money he saves each month. In particular, he is trying to decide between two different apartments. The first apartment costs $800 per month in rent and will cost an additional $260 per month in utilities. The second apartment costs $900 per month and will cost an additional $200 per month in utilities. The first apartment is slightly further from the man's work, and the man would have to drive 31 miles per day to get to work. The second apartment is closer, and the man would only have to drive 21 miles to get to work. According to the IRS, each mile a person drives has an average cost of 58 cents. If the man must drive to work 20 days each month, what is the difference between the total monthly costs of these two apartments after factoring in utility and driving-related costs?
+
+We need the following procedure:
+
+first_monthly_cost = 800 + 260
+second_monthly_cost = 900 + 200
+first_driving_cost_daily = 31 * 0.58
+second_driving_cost_daily = 21 * 0.58
+num_days_worked = 20
+first_driving_cost_monthly = first_driving_cost_daily * num_days_worked
+second_driving_cost_monthly = second_driving_cost_daily * num_days_worked
+first_total_cost = first_monthly_cost + first_driving_cost_monthly
+second_total_cost = second_monthly_cost + second_driving_cost_monthly
+absolute_difference_in_cost = | (first_total_cost - second_total_cost) |
+
+The above procedure should be expressed in JSON as the following
+
+[
+{"var": "first_monthly_cost", "expr": "Add(Const(800), Const(260))"},
+{"var": "second_monthly_cost", "expr": "Add(Const(900), Const(200))"},
+{"var": "first_driving_cost_daily", "expr": "Mul(Const(31), Const(0.58))"},
+{"var": "second_driving_cost_daily", "expr": "Mul(Const(21), Const(0.58))"},
+{"var": "num_days_worked", "expr": "Const(20)"},
+{"var": "first_driving_cost_monthly", "expr": "Mul(Var(first_driving_cost_daily), Var(num_days_worked))"},
+{"var": "second_driving_cost_monthly", "expr": "Mul(Var(second_driving_cost_daily), Var(num_days_worked))"},
+{"var": "first_total_cost", "expr": "Add(Var(first_monthly_cost), Var(first_driving_cost_monthly))"},
+{"var": "second_total_cost", "expr": "Add(Var(second_monthly_cost), Var(second_driving_cost_monthly))"},
+{"var": "absolute_difference_in_cost", "expr": "Abs(Sub(Var(first_total_cost), Var(second_total_cost)))"}
+]
+
+And the final goal is 
+
+{"var": "absolute_difference_in_cost"}
+
+Here is one final example: given the following question:
+
+Chris is deciding whether to sell apples or oranges. He can obtain either 100 apple trees that each make 10 apples a year or 3 orange trees that each make 30 oranges a year. Each apple can be sold for $0.10, while each orange can be sold for $1. Considering that Chris can either sell only apples or only oranges, what is the maximum revenue that he can make per year?
+
+We need the following procedure:
+
+apples_per_year = 100 * 10
+oranges_per_year = 3 * 30
+apple_revenue = apples_per_year * 0.10
+orange_revenue = oranges_per_year * 1
+maximum_revenue = max(apple_revenue, orange_revenue)
+
+The above procedure should be expressed in JSON as the following
+
+[
+{"var": "apples_per_year", "expr": "Mul(Const(100), Const(10))"},
+{"var": "oranges_per_year", "expr": "Mul(Const(3), Const(30))"},
+{"var": "apple_revenue", "expr": "Mul(Var(apples_per_year), Const(0.10))"},
+{"var": "orange_revenue", "expr": "Mul(Var(oranges_per_year), Const(1))"},
+{"var": "maximum_revenue", "expr": "Max(Var(apple_revenue), Var(orange_revenue))"}
+]
+
+And the final goal is
+
+{"var": "maximum_revenue"}
+
+Now, please look at the following problem:
+
+{{question}}
+""",
+prompts=[
+"let us first extract the steps as a JSON list. Do not provide the final goal yet",
+"what is the final goal represented as a JSON dict?"
+],
+model="gpt-4",
+)
+type extract_steps(bound question: String, var: String, expr: Entity), extract_goal(bound question: String, var: String)
+
+type Expr = Const(f32) | Var(String) | Add(Expr, Expr) | Sub(Expr, Expr) | Mul(Expr, Expr) | Div(Expr, Expr) | Abs(Expr) | Max(Expr, Expr) | Min(Expr, Expr) // expression tree with named variables
+
+rel to_string(expr, $format("{}", num)) = case expr is Const(num) // constants print as their number
+rel to_string(expr, $format("{}", name)) = case expr is Var(name) // variables print as their name
+rel to_string(expr, $format("({} + {})", ls, rs)) = case expr is Add(lhs, rhs) and to_string(lhs, ls) and to_string(rhs, rs) // binary operators are parenthesized infix
+rel to_string(expr, $format("({} - {})", ls, rs)) = case expr is Sub(lhs, rhs) and to_string(lhs, ls) and to_string(rhs, rs)
+rel to_string(expr, $format("({} * {})", ls, rs)) = case expr is Mul(lhs, rhs) and to_string(lhs, ls) and to_string(rhs, rs)
+rel to_string(expr, $format("({} / {})", ls, rs)) = case expr is Div(lhs, rhs) and to_string(lhs, ls) and to_string(rhs, rs)
+rel to_string(expr, $format("|{}|", inner)) = case expr is Abs(arg) and to_string(arg, inner) // absolute value prints between bars
+rel to_string(expr, $format("Max({}, {})", ls, rs)) = case expr is Max(lhs, rhs) and to_string(lhs, ls) and to_string(rhs, rs) // Max/Min print in call form
+rel to_string(expr, $format("Min({}, {})", ls, rs)) = case expr is Min(lhs, rhs) and to_string(lhs, ls) and to_string(rhs, rs)
+
+type question(q: String) // declared only; no rule in this file derives it, so facts presumably come from the driver
+
+rel step(name, body) = question(text) and extract_steps(text, name, body) // (variable, expression) pairs extracted for each question
+rel goal(name) = question(text) and extract_goal(text, name) // variable named as the final answer for each question
+
+rel eval(expr, num) = case expr is Const(num) // a constant is its own value
+rel eval(expr, out) = case expr is Var(name) and var_result(name, out) // a variable takes the value recorded for it in var_result
+rel eval(expr, x + y) = case expr is Add(lhs, rhs) and eval(lhs, x) and eval(rhs, y) // sum of the operand values
+rel eval(expr, x - y) = case expr is Sub(lhs, rhs) and eval(lhs, x) and eval(rhs, y) // left minus right
+rel eval(expr, x * y) = case expr is Mul(lhs, rhs) and eval(lhs, x) and eval(rhs, y) // product of the operand values
+rel eval(expr, x / y) = case expr is Div(lhs, rhs) and eval(lhs, x) and eval(rhs, y) // left divided by right
+rel eval(expr, x) = case expr is Abs(arg) and eval(arg, x) and x >= 0 // non-negative operand is returned unchanged
+rel eval(expr, -x) = case expr is Abs(arg) and eval(arg, x) and x < 0 // negative operand is negated
+rel eval(expr, x) = case expr is Max(lhs, rhs) and eval(lhs, x) and eval(rhs, y) and y <= x // left value when it is >= right
+rel eval(expr, y) = case expr is Max(lhs, rhs) and eval(lhs, x) and eval(rhs, y) and y > x // right value when it is strictly larger
+rel eval(expr, y) = case expr is Min(lhs, rhs) and eval(lhs, x) and eval(rhs, y) and y <= x // right value when left is >= right
+rel eval(expr, x) = case expr is Min(lhs, rhs) and eval(lhs, x) and eval(rhs, y) and y > x // left value when it is strictly smaller
+
+rel step_as_string(name, rendered) = step(name, body) and to_string(body, rendered) // readable form of each extracted step
+
+rel var_result(name, value) = step(name, body) and eval(body, value) // value of a variable = value of its step's expression
+
+rel result(value) = goal(name) and var_result(name, value) // final answer: the value of the goal variable
+
+query question // the remaining queries expose intermediate relations alongside the final result
+query goal
+query step_as_string
+query var_result
+query result
\ No newline at end of file