diff --git a/experiments/big-bench/date-understanding/.gitignore b/experiments/big-bench/date-understanding/.gitignore
new file mode 100644
index 0000000..c7f85d0
--- /dev/null
+++ b/experiments/big-bench/date-understanding/.gitignore
@@ -0,0 +1,3 @@
+dataset
+date_understanding.zip
+logs
diff --git a/experiments/big-bench/date-understanding/date-compute.scl b/experiments/big-bench/date-understanding/date-compute.scl
new file mode 100644
index 0000000..9dcc80e
--- /dev/null
+++ b/experiments/big-bench/date-understanding/date-compute.scl
@@ -0,0 +1,71 @@
+type question(question: String)
+
+@gpt_extract_info(
+  header="Please extract dates and their relationships from the provided question.",
+  prompts=[
+    "What are the mentioned MM/DD/YYYY dates in JSON format?",
+    "What is the goal in JSON format?",
+    "What are the relationships of the dates in JSON format?"
+  ],
+  examples=[
+    (
+      ["Yesterday is February 14, 2019. What is the date 1 month ago?"],
+      [
+        [("yesterday", "02/14/2019")],
+        [("1-month-ago")],
+        [("yesterday", "today", "1 day"), ("1-month-ago", "today", "1 month")]
+      ]
+    ),
+    (
+      ["Yesterday is February 14, 2019. What is the date 1 year later?"],
+      [
+        [("yesterday", "02/14/2019")],
+        [("1-year-later")],
+        [("yesterday", "today", "1 day"), ("today", "1-year-later", "1 year")]
+      ]
+    ),
+    (
+      ["The deadline is August 15, 2023, which is today. What is the date today?"],
+      [
+        [("deadline", "08/15/2023")],
+        [("today")],
+        [("today", "deadline", "0 days")]
+      ]
+    ),
+    (
+      ["Jenny began her current job on the Christmas Eve of 2016. What is the 5th anniversary?"],
+      [
+        [("start-date", "12/24/2016")],
+        [("anniversary")],
+        [("start-date", "anniversary", "5 years")]
+      ]
+    ),
+    (
+      ["Today is March 5th, 2010. Mark earns $1000 per day starting from now. When can Mark earn $10000?"],
+      [
+        [("start-date", "03/05/2010")],
+        [("earned-10000")],
+        [("start-date", "earned-10000", "10 days")]
+      ]
+    )
+  ],
+  model="gpt-4",
+  cot=[false,false,true],
+  debug=true,
+)
+type extract_mentioned_date(bound question: String, label: String, date: DateTime),
+     extract_goal (bound question: String, goal: String),
+     extract_relationship (bound question: String, earlier_date: String, later_date: String, diff: Duration)
+
+// Extract the labelled dates, the (earlier, later, diff) relationships, and the goal label from the question
+rel mentioned_date(label, date) = question(question) and extract_mentioned_date(question, label, date)
+rel relationship(l1, l2, diff) = question(question) and extract_relationship(question, l1, l2, diff)
+rel goal(label) = question(question) and extract_goal(question, label)
+
+// Derive dates transitively: the earlier label of a relationship gets date - diff, the later label gets date + diff
+rel derived_date(label, date) = mentioned_date(label, date)
+rel derived_date(label, date - diff) = relationship(label, other, diff) and derived_date(other, date)
+rel derived_date(label, date + diff) = relationship(other, label, diff) and derived_date(other, date)
+
+// The answer is the derived date of the label named as the goal
+rel answer(date) = goal(label) and derived_date(label, date)
diff --git a/experiments/big-bench/date-understanding/readme.md b/experiments/big-bench/date-understanding/readme.md
new file mode 100644
index 0000000..639fd5e
--- /dev/null
+++ b/experiments/big-bench/date-understanding/readme.md
@@ -0,0 +1,10 @@
+# date understanding
+Download and extract the dataset from [this link](https://drive.google.com/file/d/1aXUpQna54Dx4lt4wqRIdzTRanMzjXWjy/view?usp=drive_link).
+
+| Model | Score | Accuracy |
+| --- | --- | --- |
+| Scallop (5-shot + 0-shot CoT) | 341 | 92.41% |
+| Scallop (5-shot) | 304 | 82.38% |
+| GPT-4 (5-shot CoT) | 350 | **94.85%** |
+| GPT-4 (0-shot CoT) | 322 | 87.26% |
+| GPT-4 (0-shot) | 262 | 71.00% |
\ No newline at end of file
diff --git a/experiments/big-bench/date-understanding/test_gpt_baseline.py b/experiments/big-bench/date-understanding/test_gpt_baseline.py
new file mode 100644
index 0000000..9557040
--- /dev/null
+++ b/experiments/big-bench/date-understanding/test_gpt_baseline.py
@@ -0,0 +1,78 @@
+import openai
+import os
+import json
+from tqdm import tqdm
+
+DATASET = os.path.abspath(os.path.join(__file__, "../dataset/task-corrected.json"))
+# Use a context manager so the dataset file handle is closed once parsing is done.
+with open(DATASET) as dataset_file:
+    TASK = json.load(dataset_file)
+ZERO_SHOT_HEADER = """Answer only in the format of MM/DD/YYYY, where MM is 2 digits for the month, DD is 2 digits for the day, and YYYY is 4 digits for the year. Do not include anything else with your answer.
+
+Examples:
+April 20, 2021 is 04/20/2021 in MM/DD/YYYY form.
+
+Answer the following:
+"""
+# NOTE: run_gpt below only prepends ZERO_SHOT_HEADER; FEW_SHOT_HEADER is not referenced in this file.
+FEW_SHOT_HEADER = """Answer only in the format of MM/DD/YYYY, where MM is 2 digits for the month, DD is 2 digits for the day, and YYYY is 4 digits for the year. Do not include anything else with your answer.
+
+Examples:
+Question: The deadline is Jun 1, 2021, which is 2 days away from now. What is the date 10 days ago in MM/DD/YYYY?
+05/20/2021
+
+Question: Jenny began her current job on the Christmas Eve of 2016. What is the 5th anniversary in MM/DD/YYYY?
+12/24/2021
+
+Question: Today is March 5th, 2010. Mark earns $1000 per day starting from now. When can Mark earn $10000 in MM/DD/YYYY?
+03/15/2010
+
+Answer the following:
+"""
+
+
+def run_gpt(question):
+    """Send the zero-shot header plus `question` to GPT-4 and return the reply text."""
+    messages = [{"role": "user", "content": ZERO_SHOT_HEADER + question}]
+    response = openai.ChatCompletion.create(
+        model="gpt-4",
+        messages=messages,
+        temperature=0,
+    )
+    return response["choices"][0]["message"]["content"]
+
+
+def test_date_understanding():
+    """Score GPT-4 on every dataset example and write the results to data.json."""
+    out = {"score": 0, "data": []}
+
+    pbar = tqdm(TASK["examples"])
+    for example in pbar:
+        try:
+            answer = run_gpt(example["input"])
+            # The reply only scores when it is exactly one of the target_scores keys.
+            score = int(
+                answer in example["target_scores"] and example["target_scores"][answer]
+            )
+            out["score"] += score
+            out["data"] += [
+                {
+                    "question": example["input"],
+                    "answer": answer,
+                    "score": score,
+                }
+            ]
+        except Exception as e:
+            # Record the failure as a zero-score entry and keep evaluating.
+            out["data"] += [
+                {"question": example["input"], "exception": str(e), "score": 0}
+            ]
+
+        pbar.set_postfix({"score": out["score"]})
+
+    json_object = json.dumps(out.copy(), indent=4)
+    with open("data.json", "w") as outfile:
+        outfile.write(json_object)
+
+
+test_date_understanding()
diff --git a/experiments/big-bench/date-understanding/test_gpt_cot.py b/experiments/big-bench/date-understanding/test_gpt_cot.py
new file mode 100644
index 0000000..b95260a
--- /dev/null
+++ b/experiments/big-bench/date-understanding/test_gpt_cot.py
@@ -0,0 +1,95 @@
+import openai
+import os
+import json
+import re
+from tqdm import tqdm
+
+
+FEW_SHOT = True
+SHOTS = """
+Examples:
+Q: Yesterday is February 14, 2019. What is the date 1 month ago?
+A: Let's think step by step. Since yesterday is February 14, 2019, that means today is February 15, 2019. 1 month ago from today is January 15, 2019. Therefore, the answer is 01/15/2019.
+
+Q: Yesterday is February 14, 2019. What is the date 1 year later?
+A: Let's think step by step. Since yesterday is February 14, 2019, that means today is February 15, 2019. 1 year later from today is February 15, 2020. Therefore, the answer is 02/15/2020.
+
+Q: The deadline is August 15, 2023, which is today. What is the date today?
+A: Let's think step by step. Since the deadline is August 15, 2023, and since today is the deadline, that means today is August 15, 2023. Therefore, the answer is 08/15/2023.
+
+Q: Jenny began her current job on the Christmas Eve of 2016. What is the 5th anniversary?
+A: Let's think step by step. Since Jenny began her current job on Christmas Eve of 2016, that means she began on her job on December 24, 2016. The 5th anniversary is 5 years later from that date, so that date is December 24, 2021. Therefore, the answer is 12/24/2021.
+
+Q: Today is March 5th, 2010. Mark earns $1000 per day starting from now. When can Mark earn $10000?
+A: Let's think step by step. Since Mark earns $1000 per day, it takes $10000 / $1000 = 10 days for Mark to earn $10000. Because today is March 5th, 2010, Mark will earn $10000 after 10 days, which is March 15th, 2010. Therefore, the answer is 03/15/2010.
+
+
+Now here is the question:
+"""
+COT_PROMPT = "Let's think step by step."
+COT_EXTRACTION = "Therefore, in MM/DD/YYYY form, the answer is"
+REGEX = r"\d\d\/\d\d\/\d\d\d\d"
+
+DATASET = os.path.abspath(os.path.join(__file__, "../dataset/task-corrected.json"))
+# Use a context manager so the dataset file handle is closed once parsing is done.
+with open(DATASET) as dataset_file:
+    TASK = json.load(dataset_file)
+N = len(TASK["examples"])
+
+
+def run_gpt(question):
+    """Send `question` to GPT-4 as a single user message and return the reply text."""
+    messages = [{"role": "user", "content": question}]
+    response = openai.ChatCompletion.create(
+        model="gpt-4",
+        messages=messages,
+        temperature=0,
+    )
+    return response["choices"][0]["message"]["content"]
+
+
+def test_tracking(range):
+    """Run two-step chain-of-thought prompting on the examples indexed by `range` and write results to data.json."""
+    out = {"score": 0, "data": []}
+
+    pbar = tqdm(range)
+    for i in pbar:
+        example = TASK["examples"][i]
+        question = f"Q: {example['input']}\nA: {COT_PROMPT}"
+        try:
+            if FEW_SHOT:
+                response = run_gpt(SHOTS + question)
+            else:
+                response = run_gpt(question)
+            # Second call: feed the reasoning back and ask for the final date only.
+            question2 = f"{question} {response}\n{COT_EXTRACTION}"
+            response2 = run_gpt(question2)
+            # The first MM/DD/YYYY-shaped substring is the prediction ("" when there is none).
+            pred = re.findall(REGEX, response2)
+            pred = pred[0] if len(pred) > 0 else ""
+            score = int(
+                pred in example["target_scores"] and example["target_scores"][pred]
+            )
+            out["score"] += score
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": question2,
+                    "response": response2,
+                    "answer": pred,
+                    "score": score,
+                }
+            ]
+        except Exception as e:
+            out["data"] += [
+                {"id": i, "question": question, "exception": str(e), "score": 0}
+            ]
+
+        pbar.set_postfix({"score": out["score"]})
+
+    json_object = json.dumps(out.copy(), indent=4)
+    with open("data.json", "w") as outfile:
+        outfile.write(json_object)
+
+
+test_tracking(range(N))
diff --git a/experiments/big-bench/date-understanding/test_gpt_scallop.py b/experiments/big-bench/date-understanding/test_gpt_scallop.py
new file mode 100644
index 0000000..eb4c987
--- /dev/null
+++ b/experiments/big-bench/date-understanding/test_gpt_scallop.py
@@ -0,0 +1,107 @@
+import json
+from tqdm import tqdm
+from datetime import datetime
+import scallopy
+import scallopy_ext
+import os
+from io import StringIO
+import sys
+
+DATASET = os.path.abspath(os.path.join(__file__, "../dataset/task-corrected.json"))
+# Use a context manager so the dataset file handle is closed once parsing is done.
+with open(DATASET) as dataset_file:
+    TASK = json.load(dataset_file)
+N = len(TASK["examples"])
+SCALLOP_FILE = os.path.abspath(os.path.join(__file__, "../date-compute.scl"))
+
+
+class Args:
+    """Settings object handed to scallopy_ext.config.configure()."""
+
+    def __init__(self):
+        self.cuda = False
+        self.gpu = None
+        self.num_allowed_openai_request = N
+        self.openai_gpt_model = "gpt-4"
+        self.openai_gpt_temperature = 0
+
+
+def test_date_understanding(range):
+    """Run the Scallop program on the examples indexed by `range` and write results and logs to data.json."""
+    out = {"score": 0, "data": [], "logs": []}
+
+    # Configure scallop extension library
+    scallopy_ext.config.configure(Args())
+
+    # Setup scallop context
+    ctx = scallopy.ScallopContext(provenance="unit")
+    scallopy_ext.extlib.load_extlib(ctx)
+    ctx.import_file(SCALLOP_FILE)
+
+    # Remember the real stdout so it can be restored after each example.
+    saved_stdout = sys.stdout
+
+    # Iterate through all the datapoints
+    pbar = tqdm(range)
+    for i in pbar:
+        example = TASK["examples"][i]
+        buffer = StringIO()
+        sys.stdout = buffer
+
+        try:
+            temp_ctx = ctx.clone()
+            temp_ctx.add_facts("question", [(example["input"],)])
+            temp_ctx.run()
+            # Keep the text before the first space of each answer, parse it as YYYY-MM-DD, and re-format as MM/DD/YYYY.
+            res = [
+                datetime.strptime(x[0].split(" ")[0], "%Y-%m-%d").strftime("%m/%d/%Y")
+                for x in list(temp_ctx.relation("answer"))
+            ]
+            score = 0
+            final_answer = ""
+            for answer in example["target_scores"]:
+                if answer in res:
+                    final_answer = answer
+                    score = example["target_scores"][answer]
+                    break
+            out["score"] += score
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": example["input"],
+                    "final_answer": final_answer,
+                    "score": score,
+                    "mentioned_date": list(temp_ctx.relation("mentioned_date")),
+                    "relationship": list(temp_ctx.relation("relationship")),
+                    "goal": list(temp_ctx.relation("goal")),
+                    "answer": list(temp_ctx.relation("answer")),
+                }
+            ]
+        except Exception as e:
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": example["input"],
+                    "exception": str(e),
+                    "score": 0,
+                }
+            ]
+        finally:
+            # Stop capturing so stdout is never left pointing at a stale buffer.
+            sys.stdout = saved_stdout
+
+        out["logs"] += [
+            {
+                "id": i,
+                "log": buffer.getvalue().encode("utf-8").decode("unicode_escape"),
+            }
+        ]
+
+        pbar.set_postfix({"score": out["score"]})
+
+    json_object = json.dumps(out.copy(), indent=2)
+    with open("data.json", "w") as outfile:
+        outfile.write(json_object)
+
+
+test_date_understanding(range(N))
diff --git a/experiments/big-bench/object-tracking/readme.md b/experiments/big-bench/object-tracking/readme.md
new file mode 100644
index 0000000..5e0a1ce
--- /dev/null
+++ b/experiments/big-bench/object-tracking/readme.md
@@ -0,0 +1,9 @@
+# object tracking
+Download and extract the dataset from [this link](https://drive.google.com/file/d/1tpKWhBHSCyA7HE2RoBmpSRprgjqUJa1P/view?usp=drive_link).
+
+| Model | Score | Accuracy |
+| --- | --- | --- |
+| Scallop (1-shot) | 150 | **100.00%** |
+| GPT-4 (3-shot CoT) | 149 | 99.33% |
+| GPT-4 (0-shot CoT) | 126 | 84.00% |
+| GPT-4 (0-shot) | 45 | 30.00% |
\ No newline at end of file
diff --git a/experiments/big-bench/object-tracking/test_gpt_baseline.py b/experiments/big-bench/object-tracking/test_gpt_baseline.py
new file mode 100644
index 0000000..3bd7d05
--- /dev/null
+++ b/experiments/big-bench/object-tracking/test_gpt_baseline.py
@@ -0,0 +1,98 @@
+import openai
+import json
+from tqdm import tqdm
+import random
+import os
+
+
+ZERO_SHOT_HEADER = "Respond only with the answer choice."
+FEW_SHOT_HEADER = """Respond only with the answer choice.
+
+Examples:
+Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a orange ball, Bob has a white ball, and Claire has a blue ball. \n\nAs the game progresses, pairs of players trade balls. First, Alice and Bob swap balls. Then, Bob and Claire swap balls. Finally, Alice and Bob swap balls. At the end of the game, Alice has the \nAnswer choices: (A) orange ball. (B) white ball. (C) blue ball.
+A: (C) blue ball.
+
+Q: Alice, Bob, Claire, Dave, and Eve are playing a game. At the start of the game, they are each holding a ball: Alice has a pink ball, Bob has a white ball, Claire has a red ball, Dave has a purple ball, and Eve has a yellow ball. \n\nAs the game progresses, pairs of players trade balls. First, Alice and Dave swap balls. Then, Claire and Eve swap balls. Then, Alice and Bob swap balls. Then, Dave and Claire swap balls. Finally, Alice and Claire swap balls. At the end of the game, Alice has the \nAnswer choices: (A) pink ball. (B) white ball. (C) red ball. (D) purple ball. (E) yellow ball.
+A: (A) pink ball.
+
+Q: Alice, Bob, Claire, Dave, Eve, Fred, and Gertrude are playing a game. At the start of the game, they are each holding a ball: Alice has a green ball, Bob has a white ball, Claire has a yellow ball, Dave has a pink ball, Eve has a orange ball, Fred has a black ball, and Gertrude has a brown ball. \n\nAs the game progresses, pairs of players trade balls. First, Bob and Gertrude swap balls. Then, Fred and Claire swap balls. Then, Dave and Gertrude swap balls. Then, Bob and Gertrude swap balls. Then, Alice and Claire swap balls. Then, Gertrude and Claire swap balls. Finally, Eve and Claire swap balls. At the end of the game, Alice has the \nAnswer choices: (A) green ball. (B) white ball. (C) yellow ball. (D) pink ball. (E) orange ball. (F) black ball. (G) brown ball.
+A: (F) black ball.
+
+Answer the following:
+"""
+
+
+def load_task(filename):
+    """Parse one task file from the dataset directory that sits next to this script."""
+    path = os.path.abspath(os.path.join(__file__, "../dataset", filename))
+    # json.load() reads from a file object; handing it the path string raises AttributeError.
+    with open(path) as task_file:
+        return json.load(task_file)
+
+
+TASK_3 = load_task("task-3-objects.json")
+TASK_5 = load_task("task-5-objects.json")
+TASK_7 = load_task("task-7-objects.json")
+DATASET = TASK_3["examples"] + TASK_5["examples"] + TASK_7["examples"]
+N = len(DATASET)
+SEED = 10
+SAMPLE_SIZE = 150
+
+
+def run_gpt(question):
+    """Send `question` to GPT-4 as a single user message and return the reply text."""
+    messages = [{"role": "user", "content": question}]
+    response = openai.ChatCompletion.create(
+        model="gpt-4",
+        messages=messages,
+        temperature=0,
+    )
+    return response["choices"][0]["message"]["content"]
+
+
+def test_tracking(range):
+    """Score GPT-4 on the examples indexed by `range` and write the results to data.json."""
+    out = {"score": 0, "data": []}
+
+    pbar = tqdm(range)
+    for i in pbar:
+        example = DATASET[i]
+        choices = []
+        letter = "A"
+        target = ""
+        # Label the choices (A), (B), ... and remember the text of the correct one.
+        for choice in example["target_scores"]:
+            choices.append(f"({letter}) {choice}")
+            if example["target_scores"][choice]:
+                target = choice
+            letter = chr(ord(letter) + 1)
+        question = f"{ZERO_SHOT_HEADER}\n{example['input']}\nAnswer choices: {' '.join(choices)}"
+        try:
+            answer = run_gpt(question)
+            # A reply counts as correct when it contains the correct choice's text.
+            score = int(target in answer)
+            out["score"] += score
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": question,
+                    "answer": answer,
+                    "score": score,
+                }
+            ]
+        except Exception as e:
+            # Record the failure as a zero-score entry and keep evaluating.
+            out["data"] += [
+                {"id": i, "question": question, "exception": str(e), "score": 0}
+            ]
+
+        pbar.set_postfix({"score": out["score"]})
+
+    json_object = json.dumps(out.copy(), indent=4)
+    with open("data.json", "w") as outfile:
+        outfile.write(json_object)
+
+
+random.seed(SEED)
+sample = random.sample(range(N), SAMPLE_SIZE)
+test_tracking(sample)
diff --git a/experiments/big-bench/object-tracking/test_gpt_cot.py b/experiments/big-bench/object-tracking/test_gpt_cot.py
new file mode 100644
index 0000000..57d028c
--- /dev/null
+++ b/experiments/big-bench/object-tracking/test_gpt_cot.py
@@ -0,0 +1,124 @@
+import openai
+import json
+import re
+from tqdm import tqdm
+import random
+import os
+
+
+FEW_SHOT = False
+SHOTS = """
+Examples:
+Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a orange ball, Bob has a white ball, and Claire has a blue ball. \n\nAs the game progresses, pairs of players trade balls. First, Alice and Bob swap balls. Then, Bob and Claire swap balls. Finally, Alice and Bob swap balls. At the end of the game, Alice has the
+A: Let's think step by step.
+1. Alice and Bob swap balls. Now Alice has the white ball, and Bob has the orange ball.
+2. Bob and Claire swap balls. Now Bob has the blue ball, and Claire has the orange ball.
+3. Alice and Bob swap balls. Now Alice has the blue ball, and Bob has the white ball.
+At the end of the game, Alice has the blue ball.
+
+Q: Alice, Bob, and Claire are on the same team in a soccer match. At the start of the match, they are each assigned to a position: Alice is playing striker, Bob is playing benchwarmer, and Claire is playing center midfielder. \n\nAs the game progresses, pairs of players occasionally swap positions. First, Alice and Bob trade positions. Then, Alice and Claire trade positions. Finally, Claire and Bob trade positions. At the end of the match, Bob is playing
+A: Let's think step by step.
+1. Alice and Bob trade positions. Now Alice is benchwarmer, and Bob is striker.
+2. Alice and Claire trade positions. Now Alice is center midfielder, and Claire is benchwarmer.
+3. Claire and Bob trade positions. Now Bob is benchwarmer, and Claire is striker.
+At the end of the match, Bob is playing benchwarmer.
+
+Q: Alice, Bob, and Claire are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Patrick, Bob is dancing with Karl, and Claire is dancing with Izzi. \n\nThroughout the song, the dancers often trade partners. First, Bob and Claire switch partners. Then, Claire and Alice switch partners. Finally, Claire and Bob switch partners. At the end of the dance, Claire is dancing with
+A: Let's think step by step.
+1. Bob and Claire switch partners. Now Bob's partner is Izzi, and Claire's partner is Karl.
+2. Claire and Alice switch partners. Now Alice's partner is Karl, and Claire's partner is Patrick.
+3. Claire and Bob switch partners. Now Bob's partner is Patrick, and Claire's partner is Izzi.
+At the end of the dance, Claire is dancing with Izzi.
+
+
+Now here is the question:
+"""
+COT_PROMPT = "Let's think step by step."
+COT_EXTRACTION = "Therefore, among A through G, the answer is"
+# Match only a standalone choice letter: a bare A|B|...|G alternation also matches the
+# first capital A-G inside a word, e.g. the player names Alice, Bob or Claire.
+REGEX = r"\b[A-G]\b"
+
+
+def load_task(filename):
+    """Parse one task file from the dataset directory that sits next to this script."""
+    path = os.path.abspath(os.path.join(__file__, "../dataset", filename))
+    # json.load() reads from a file object; handing it the path string raises AttributeError.
+    with open(path) as task_file:
+        return json.load(task_file)
+
+
+TASK_3 = load_task("task-3-objects.json")
+TASK_5 = load_task("task-5-objects.json")
+TASK_7 = load_task("task-7-objects.json")
+DATASET = TASK_3["examples"] + TASK_5["examples"] + TASK_7["examples"]
+N = len(DATASET)
+SEED = 10
+SAMPLE_SIZE = 150
+
+
+def run_gpt(question):
+    """Send `question` to GPT-4 as a single user message and return the reply text."""
+    messages = [{"role": "user", "content": question}]
+    response = openai.ChatCompletion.create(
+        model="gpt-4",
+        messages=messages,
+        temperature=0,
+    )
+    return response["choices"][0]["message"]["content"]
+
+
+def test_tracking(range):
+    """Run two-step chain-of-thought prompting on the examples indexed by `range` and write results to data.json."""
+    out = {"score": 0, "data": []}
+
+    pbar = tqdm(range)
+    for i in pbar:
+        example = DATASET[i]
+        choices = []
+        letter = "A"
+        target = ""
+        # Label the choices (A), (B), ... and remember the letter of the correct one.
+        for choice in example["target_scores"]:
+            choices.append(f"({letter}) {choice}")
+            if example["target_scores"][choice]:
+                target = letter
+            letter = chr(ord(letter) + 1)
+        question = f"Q: {example['input']} Answer choices: {' '.join(choices)}\nA: {COT_PROMPT}"
+        try:
+            if FEW_SHOT:
+                response = run_gpt(SHOTS + question)
+            else:
+                response = run_gpt(question)
+            # Second call: feed the reasoning back and ask for the final choice letter.
+            question2 = f"{question} {response}\n{COT_EXTRACTION}"
+            response2 = run_gpt(question2)
+            pred = re.findall(REGEX, response2)
+            pred = pred[0] if len(pred) > 0 else ""
+            score = int(pred == target)
+            out["score"] += score
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": question2,
+                    "response": response2,
+                    "answer": pred,
+                    "score": score,
+                }
+            ]
+        except Exception as e:
+            # Record the failure as a zero-score entry and keep evaluating.
+            out["data"] += [
+                {"id": i, "question": question, "exception": str(e), "score": 0}
+            ]
+
+        pbar.set_postfix({"score": out["score"]})
+
+    json_object = json.dumps(out.copy(), indent=4)
+    with open("data.json", "w") as outfile:
+        outfile.write(json_object)
+
+
+random.seed(SEED)
+sample = random.sample(range(N), SAMPLE_SIZE)
+test_tracking(sample)
diff --git a/experiments/big-bench/object-tracking/test_gpt_scallop.py b/experiments/big-bench/object-tracking/test_gpt_scallop.py
new file mode 100644
index 0000000..26e9118
--- /dev/null
+++ b/experiments/big-bench/object-tracking/test_gpt_scallop.py
@@ -0,0 +1,121 @@
+import json
+import jsbeautifier
+from tqdm import tqdm
+import scallopy
+import scallopy_ext
+from io import StringIO
+import sys
+import random
+import os
+
+
+def load_task(filename):
+    """Parse one task file from the dataset directory that sits next to this script."""
+    path = os.path.abspath(os.path.join(__file__, "../dataset", filename))
+    # json.load() reads from a file object; handing it the path string raises AttributeError.
+    with open(path) as task_file:
+        return json.load(task_file)
+
+
+TASK_3 = load_task("task-3-objects.json")
+TASK_5 = load_task("task-5-objects.json")
+TASK_7 = load_task("task-7-objects.json")
+DATASET = TASK_3["examples"] + TASK_5["examples"] + TASK_7["examples"]
+N = len(DATASET)
+SEED = 10
+SAMPLE_SIZE = 150
+# Resolve next to this script, like the dataset paths, so the run does not depend on the working directory.
+SCALLOP_FILE = os.path.abspath(os.path.join(__file__, "../tracking.scl"))
+
+
+class Args:
+    """Settings object handed to scallopy_ext.config.configure()."""
+
+    def __init__(self):
+        self.cuda = False
+        self.gpu = None
+        self.num_allowed_openai_request = SAMPLE_SIZE
+        self.openai_gpt_model = "gpt-4"
+        self.openai_gpt_temperature = 0
+
+
+def test_date_understanding(range):
+    """Run the Scallop program on the examples indexed by `range` and write results and logs to data.json."""
+    out = {"score": 0, "data": [], "logs": []}
+
+    # Configure scallop extension library
+    scallopy_ext.config.configure(Args())
+
+    # Setup scallop context
+    ctx = scallopy.ScallopContext(provenance="unit")
+    scallopy_ext.extlib.load_extlib(ctx)
+    ctx.import_file(SCALLOP_FILE)
+
+    # Remember the real stdout so it can be restored after each example.
+    saved_stdout = sys.stdout
+
+    # Iterate through all the datapoints
+    pbar = tqdm(range)
+    for i in pbar:
+        example = DATASET[i]
+        buffer = StringIO()
+        sys.stdout = buffer
+
+        try:
+            temp_ctx = ctx.clone()
+            temp_ctx.add_facts("question", [(example["input"],)])
+            temp_ctx.run()
+            res = list(temp_ctx.relation("answer"))[0][0]
+            score = 0
+            final_answer = ""
+            for answer in example["target_scores"]:
+                if res in answer:
+                    final_answer = answer
+                    score = example["target_scores"][answer]
+                    break
+            out["score"] += score
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": example["input"],
+                    "final_answer": final_answer,
+                    "score": score,
+                    "possessions": list(temp_ctx.relation("possessions")),
+                    "swaps": list(temp_ctx.relation("swaps")),
+                    "goal": list(temp_ctx.relation("goal")),
+                    "answer": list(temp_ctx.relation("answer")),
+                }
+            ]
+        except Exception as e:
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": example["input"],
+                    "exception": str(e),
+                    "score": 0,
+                }
+            ]
+        finally:
+            # Stop capturing so stdout is never left pointing at a stale buffer.
+            sys.stdout = saved_stdout
+
+        out["logs"] += [
+            {
+                "id": i,
+                "log": buffer.getvalue().encode("utf-8").decode("unicode_escape"),
+            }
+        ]
+
+        pbar.set_postfix({"score": out["score"]})
+
+    options = jsbeautifier.default_options()
+    options.indent_size = 2
+    json_object = jsbeautifier.beautify(json.dumps(out.copy()), options)
+    with open("data.json", "w") as outfile:
+        outfile.write(json_object)
+
+
+random.seed(SEED)
+sample = random.sample(range(N), SAMPLE_SIZE)
+# NOTE(review): only sample[100:] (the last 50 of the 150 sampled indices) is evaluated - confirm this is intended.
+test_date_understanding(sample[100:])
diff --git a/experiments/big-bench/object-tracking/tracking.scl b/experiments/big-bench/object-tracking/tracking.scl
new file mode 100644
index 0000000..a2e6416
--- /dev/null
+++ b/experiments/big-bench/object-tracking/tracking.scl
@@ -0,0 +1,40 @@
+type question(question: String)
+//rel question = {"Anthony, Ben, and Charles are playing a game. At the start of the game, they are each holding a ball: Anthony has a orange ball, Ben has a white ball, and Charles has a blue ball. \n\nAs the game progresses, pairs of players trade balls. Ben and Charles swap balls. But before that, Anthony and Ben swap balls. At the very end, Anthony and Ben swap balls. But at the very beginning of the game, Anthony and Charles swap balls. At the end of the game, Ben has the "}
+
+@gpt_extract_info(
+  header="Please extract the relationships from the provided question.",
+  prompts=[
+    "What are the initial possessions in JSON format? (use 'person' and 'object' as fields)",
+    "What are the swaps in JSON format?",
+    "Who is the goal in JSON format?"
+  ],
+  examples=[
+    (
+      ["Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a orange ball, Bob has a white ball, and Claire has a blue ball. As the game progresses, pairs of players trade balls. First, Alice and Bob swap balls. Then, Bob and Claire swap balls. Finally, Alice and Bob swap balls. At the end of the game, Alice has the"],
+      [
+        [("Alice", "orange ball"), ("Bob", "white ball"), ("Claire", "blue ball")],
+        [("1", "Alice", "Bob"), ("2", "Bob", "Claire"), ("3", "Alice", "Bob")],
+        [("Alice")]
+      ]
+    )
+  ],
+  model="gpt-4",
+  debug=true,
+)
+type extract_possessions (bound question: String, person: String, object: String),
+     extract_swaps (bound question: String, time: i32, person_a: String, person_b: String),
+     extract_goal (bound question: String, goal: String),
+
+// Facts extracted from the question; the initial possessions are stamped with time 1
+rel possessions(1, person, object) = question(question) and extract_possessions(question, person, object)
+rel swaps(time, p1, p2) = question(question) and extract_swaps(question, time, p1, p2)
+rel goal(person) = question(question) and extract_goal(question, person)
+
+// Swaps are symmetric; at time t the swapping pair exchange objects and everyone else keeps theirs into t + 1
+rel swaps(time, p1, p2) = swaps(time, p2, p1)
+rel possessions(t + 1, p1, object) = swaps(t, p1, p2) and possessions(t, p2, object)
+rel possessions(t + 1, p1, object) = swaps(t, _, _) and not swaps(t, p1, _) and possessions(t, p1, object)
+
+// The answer is what the goal person holds one step after the last swap
+rel final_time(t + 1) = t := max(t: swaps(t, _, _))
+rel answer(object) = goal(person) and possessions(t, person, object) and final_time(t)
\ No newline at end of file