From b463b453a7fc486082bcd81a0675ce419389a2ca Mon Sep 17 00:00:00 2001
From: Ziyang Li
Date: Fri, 23 Feb 2024 09:21:25 -0500
Subject: [PATCH] Adding clutrr experiments

---
 experiments/clutrr-v2/dataset_loader.py   |  37 ++++
 experiments/clutrr-v2/kinship.scl         | 194 ++++++++++++++++++++++
 experiments/clutrr-v2/kinship_baseline.py |  86 ++++++++++
 experiments/clutrr-v2/kinship_cot.py      |  88 ++++++++++
 experiments/clutrr-v2/kinship_scallop.py  |  97 +++++++++++
 experiments/clutrr-v2/level_accuracy.py   |  56 +++++++
 6 files changed, 558 insertions(+)
 create mode 100644 experiments/clutrr-v2/dataset_loader.py
 create mode 100644 experiments/clutrr-v2/kinship.scl
 create mode 100644 experiments/clutrr-v2/kinship_baseline.py
 create mode 100644 experiments/clutrr-v2/kinship_cot.py
 create mode 100644 experiments/clutrr-v2/kinship_scallop.py
 create mode 100644 experiments/clutrr-v2/level_accuracy.py

diff --git a/experiments/clutrr-v2/dataset_loader.py b/experiments/clutrr-v2/dataset_loader.py
new file mode 100644
index 0000000..607f9d2
--- /dev/null
+++ b/experiments/clutrr-v2/dataset_loader.py
@@ -0,0 +1,37 @@
+import ast
+import os
+import csv
+
+class CLUTRRDataset:
+    """Rows of the CLUTRR `*_{split}.csv` files, served as ((context, query), answer) items."""
+
+    def __init__(self, root=".", dataset="data_089907f8", split="test"):
+        self.dataset_dir = os.path.join(root, f"CLUTRR/{dataset}/")
+        # os.listdir returns names in arbitrary order; sort them so item indices are reproducible
+        self.file_names = [os.path.join(self.dataset_dir, d) for d in sorted(os.listdir(self.dataset_dir)) if f"_{split}.csv" in d]
+        self.data = []
+        for f in self.file_names:
+            # Close each file as soon as it is read; newline="" is what the csv module expects
+            with open(f, newline="") as csv_file:
+                # Skip the first row of every file (presumably the CSV header)
+                self.data.extend(list(csv.reader(csv_file))[1:])
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, i):
+        # Context is a list of sentences
+        context = self.data[i][2].strip()
+
+        # Remove square brackets
+        context = context.replace('[', '')
+        context = context.replace(']', '')
+
+        # Query is of type (sub, obj). The cell is presumably a Python tuple literal, so parse it
+        # with ast.literal_eval, which cannot execute code the way eval can
+        query_sub_obj = ast.literal_eval(self.data[i][3])
+        query = (query_sub_obj[0], query_sub_obj[1])
+
+        # Answer is one of 20 classes such as daughter, mother, ...
+        answer = self.data[i][5]
+        return ((context, query), answer)
\ No newline at end of file
diff --git a/experiments/clutrr-v2/kinship.scl b/experiments/clutrr-v2/kinship.scl
new file mode 100644
index 0000000..794a190
--- /dev/null
+++ b/experiments/clutrr-v2/kinship.scl
@@ -0,0 +1,194 @@
+type context(ctx: String)
+
+@gpt_extract_info(
+  header="""
+In this task, you will be given a question regarding kinships between characters in a story.
+Please extract individual kinships mentioned in the story and the characters whose kinship is desired in the question.
+
+Formally, you should extract two types of information in JSON format:
+1. Mentioned kinships. This should be a JSON list covering all the kinships mentioned, where each element
+is a JSON object with `p1`, `p2`, and `rela` fields, denoting that `p1` is `p2`'s' `rela`.
+Please pay extra attention to the directionality of the relation. For a pair of people,
+you should generate two relations, one between p1 and p2 and another between p2 and p1. Please only extract direct information mentioned in the question and avoid
+doing any computation except for making sure you get both directions.
+2. The query. It should be a JSON object with `p1` and `p2` fields, between which is the
+relationship we want to derived.
+
+Examples:
+
+Question: Dorothy's brother Michael and her went to get ice cream. Michael is the proud father of the lovely Donald. Who is Dorothy to Donald?
+What are the mentioned kinships in JSON format?
+[{"p1": "Michael", "p2": "Dorothy", "rela": "brother"}, {"p1": "Dorothy", "p2": "Michael", "rela": "sister"}, {"p1": "Michael", "p2": "Donald", "rela": "father"}, {"p1": "Donald", "p2": "Michael", "rela": "son"}]
+Whose kinship do we want to find?
+[{"p1": "Dorothy", "p2": "Donald"}]
+
+
+Question: Michael and his daughter Jennifer like to read poems together. Jason is the proud father of the lovely Michael. Who is Jason to Jennifer?
+What are the mentioned kinships in JSON format?
+[{"p1": "Jennifer", "p2": "Michael", "rela": "daughter"}, {"p1": "Michael", "p2": "Jennifer", "rela": "father"}, {"p1": "Jason", "p2": "Michael", "rela": "father"}, {"p1": "Michael", "p2": "Jason", "rela": "son"}]
+Whose kinship do we want to find?
+[{"p1": "Jason", "p2": "Jennifer"}]
+
+
+Question: Kevin loves going to plays with his wife Aida. Aida's dad James, however, does not like them at all. Who is James to Kevin?
+What are the mentioned kinships in JSON format?
+[{"p1": "Aida", "p2": "Kevin", "rela": "wife"}, {"p1": "Kevin", "p2": "Aida", "rela": "husband"}, {"p1": "James", "p2": "Aida", "rela": "father"}, {"p1": "Aida", "p2": "James", "rela": "daughter"}]
+Whose kinship do we want to find?
+[{"p1": "James", "p2": "Kevin"}]
+
+
+Now, look at the following context.
+
+Question: {{context}}
+  """,
+  prompts=[
+    "Now, first give me the kinships mentioned in this question in JSON format",
+    "Good, now please tell me the two people whose kinship we want to find in JSON format"
+  ],
+  model="gpt-4",
+)
+type extract_kinship(bound context: String, p1: String, p2: String, rela: String),
+     extract_question(bound context: String, p1: String, p2: String)
+
+rel kinship(p1, p2, rela) = context(ctx) and extract_kinship(ctx, p1, p2, rela)
+rel question(p1, p2) = context(ctx) and extract_question(ctx, p1, p2)
+
+// composition(r2, r1, r3): if B is C's `r2` and A is B's `r1`, then A is C's `r3`
+// (e.g. ("father", "brother", "uncle"): C's father's brother is C's uncle)
+rel composition = {
+  ("daughter", "daughter", "granddaughter"),
+  ("daughter", "sister", "daughter"),
+  ("daughter", "son", "grandson"),
+  ("daughter", "aunt", "sister"),
+  ("daughter", "father", "husband"),
+  ("daughter", "husband", "son-in-law"),
+  ("daughter", "brother", "son"),
+  ("daughter", "mother", "wife"),
+  ("daughter", "uncle", "brother"),
+  ("daughter", "grandfather", "father"),
+  ("daughter", "grandfather", "father-in-law"),
+  ("daughter", "grandmother", "mother"),
+  ("daughter", "grandmother", "mother-in-law"),
+  ("sister", "daughter", "niece"),
+  ("sister", "sister", "sister"),
+  ("sister", "son", "nephew"),
+  ("sister", "aunt", "aunt"),
+  ("sister", "father", "father"),
+  ("sister", "brother", "brother"),
+  // ("sister", "nephew", "nephew"),
+  // ("sister", "nephew", "son"),
+  ("sister", "mother", "mother"),
+  ("sister", "uncle", "uncle"),
+  ("sister", "grandfather", "grandfather"),
+  ("sister", "grandmother", "grandmother"),
+  // ("sister", "niece", "niece"),
+  // ("sister", "niece", "daughter"),
+  ("son", "daughter", "granddaughter"),
+  ("son", "sister", "daughter"),
+  ("son", "son", "grandson"),
+  ("son", "aunt", "sister"),
+  ("son", "father", "husband"),
+  ("son", "brother", "son"),
+  ("son", "mother", "wife"),
+  ("son", "uncle", "brother"),
+  ("son", "grandfather", "father"),
+  // ("son", "grandfather", "father-in-law"),
+  ("son", "wife", "daughter-in-law"),
+  ("son", "grandmother", "mother"),
+  // ("son", "grandmother", "mother-in-law"),
+  ("aunt", "sister", "aunt"),
+  ("aunt", "father", "grandfather"),
+  ("aunt", "brother", "uncle"),
+  ("aunt", "mother", "grandmother"),
+  ("father", "daughter", "sister"),
+  ("father", "sister", "aunt"),
+  ("father", "son", "brother"),
+  ("father", "father", "grandfather"),
+  // ("father", "granddaughter", "daughter"),
+  // ("father", "granddaughter", "niece"),
+  ("father", "brother", "uncle"),
+  ("father", "mother", "grandmother"),
+  ("father", "wife", "mother"),
+  ("husband", "daughter", "daughter"),
+  ("husband", "son", "son"),
+  ("husband", "father", "father-in-law"),
+  ("husband", "granddaughter", "granddaughter"),
+  ("husband", "mother", "mother-in-law"),
+  ("husband", "grandson", "grandson"),
+  ("granddaughter", "sister", "granddaughter"),
+  // ("granddaughter", "father", "son"),
+  // ("granddaughter", "father", "son-in-law"),
+  ("granddaughter", "brother", "grandson"),
+  // ("granddaughter", "mother", "daughter"),
+  // ("granddaughter", "mother", "daughter-in-law"),
+  // ("granddaughter", "grandfather", "husband"),
+  // ("granddaughter", "grandmother", "wife"),
+  ("brother", "daughter", "niece"),
+  ("brother", "sister", "sister"),
+  ("brother", "son", "nephew"),
+  ("brother", "aunt", "aunt"),
+  ("brother", "father", "father"),
+  ("brother", "brother", "brother"),
+  // ("brother", "nephew", "nephew"),
+  // ("brother", "nephew", "son"),
+  ("brother", "mother", "mother"),
+  ("brother", "uncle", "uncle"),
+  ("brother", "grandfather", "grandfather"),
+  ("brother", "grandmother", "grandmother"),
+  // 0.8::("brother", "niece", "niece"),
+  // 0.8::("brother", "niece", "daughter"),
+  ("nephew", "sister", "niece"),
+  // ("nephew", "aunt", "wife"),
+  // ("nephew", "aunt", "wife"),
+  // ("nephew", "father", "brother"),
+  ("nephew", "brother", "nephew"),
+  // ("nephew", "mother", "sister"),
+  // ("nephew", "uncle", "brother"),
+  // ("nephew", "uncle", "husband"),
+  // ("nephew", "grandfather", "father"),
+  // ("nephew", "grandmother", "mother"),
+  ("mother", "daughter", "sister"),
+  ("mother", "sister", "aunt"),
+  ("mother", "son", "brother"),
+  ("mother", "father", "grandfather"),
+  ("mother", "husband", "father"),
+  // ("mother", "granddaughter", "daughter"),
+  // ("mother", "granddaughter", "niece"),
+  ("mother", "brother", "uncle"),
+  ("mother", "mother", "grandmother"),
+  // ("mother", "grandson", "son"),
+  // ("mother", "grandson", "nephew"),
+  // ("mother", "son-in-law", "husband"),
+  ("mother", "father-in-law", "grandfather"),
+  // ("mother", "daughter-in-law", "wife"),
+  ("mother", "mother-in-law", "grandmother"),
+  ("uncle", "sister", "aunt"),
+  ("uncle", "father", "grandfather"),
+  ("uncle", "brother", "uncle"),
+  ("uncle", "mother", "grandmother"),
+  // ("grandfather", "daughter", "mother"),
+  // ("grandfather", "daughter", "aunt"),
+  // ("grandfather", "son", "father"),
+  // ("grandfather", "son", "uncle"),
+  ("grandfather", "wife", "grandmother"),
+  ("wife", "daughter", "daughter"),
+  ("wife", "son", "son"),
+  ("wife", "father", "father-in-law"),
+  ("wife", "granddaughter", "granddaughter"),
+  ("wife", "mother", "mother-in-law"),
+  ("wife", "grandson", "grandson"),
+  ("wife", "son-in-law", "son-in-law"),
+  ("wife", "father-in-law", "father"),
+  ("wife", "daughter-in-law", "daughter-in-law"),
+  ("wife", "mother-in-law", "mother"),
+  ("grandmother", "husband", "grandfather"),
+  ("grandson", "sister", "granddaughter"),
+  // ("grandson", "father", "son"),
+  // ("grandson", "father", "son-in-law"),
+  ("grandson", "brother", "grandson"),
+}
+
+// A fact (p1, p2, rela) reads "p1 is p2's rela"; close the mentioned kinships under composition
+rel derived_kinship(p1, p2, rela) = kinship(p1, p2, rela)
+rel derived_kinship(p1, p3, r3) = p1 != p3 and derived_kinship(p1, p2, r1) and derived_kinship(p2, p3, r2) and composition(r2, r1, r3)
+rel result(r) = question(p1, p2) and derived_kinship(p1, p2, r)
diff --git a/experiments/clutrr-v2/kinship_baseline.py b/experiments/clutrr-v2/kinship_baseline.py
new file mode 100644
index 0000000..414c88c
--- /dev/null
+++ b/experiments/clutrr-v2/kinship_baseline.py
@@ -0,0 +1,86 @@
+import openai
+import json
+import re
+from tqdm import tqdm
+from io import StringIO
+import sys
+
+from dataset_loader import CLUTRRDataset
+
+TASK = CLUTRRDataset()
+N = len(TASK)
+
+# Few-shot prompt. It ends with "Question: " so that the question can be appended directly.
+HEADER = '''
+In this task, you will be given a question regarding kinships between characters in a story.
+Please output your reasoning as a chain of thought and then output your answer on a new line at the end.
+
+Here are some examples:
+
+Question: Dorothy's brother Michael and her went to get ice cream. Michael is the proud father of the lovely Donald. Who is Dorothy to Donald?
+Answer: Dorothy is Donald's aunt. In the given scenario, Michael is Dorothy's brother, and Michael is the father of Donald. This makes Dorothy the sister of Donald's father, which means she is Donald's aunt.
+
+aunt
+
+Question: Michael and his daughter Jennifer like to read poems together. Jason is the proud father of the lovely Michael. Who is Jason to Jennifer?
+Answer: Jason is Jennifer's grandfather. In the given scenario, Michael is Jennifer's father, and he enjoys reading poems with her. It is also mentioned that Jason is the proud father of Michael. Therefore, Jason is Jennifer's grandfather, as he is the father of her father, Michael.
+
+grandfather
+
+Question: Kevin loves going to plays with his wife Aida. Aida's dad James, however, does not like them at all. Who is James to Kevin?
+Answer: James is Aida's father, and Aida is Kevin's wife. Therefore, James is Kevin's father-in-law.
+
+father-in-law
+
+Now, answer the following question:
+
+\n
+Question: '''
+
+def normalize(answer):
+    """Lower-case `answer` and drop every non-letter, so that e.g. 'Aunt.' compares equal to 'aunt'."""
+    return re.sub(r'[^a-zA-Z]', '', answer.strip().lower())
+
+def run_gpt(question):
+    """Send HEADER followed by `question` to GPT-4 and return the text of its reply."""
+    messages = [{"role": "user", "content": HEADER + question + "\nAnswer:"}]
+    response = openai.ChatCompletion.create(
+        model="gpt-4",
+        messages=messages,
+        temperature=0,
+    )
+    return response["choices"][0]["message"]["content"]
+
+def test_kinship(range):
+    """Query GPT-4 for every dataset index in `range`, recording results in data_baseline.json."""
+    out = {"score": 0, "data": []}
+
+    for i in tqdm(range):
+        (ctx, query), ans = TASK[i]
+        question = ctx + " Who is " + query[1] + " to " + query[0] + "?"
+
+        try:
+            output_ans = run_gpt(question)
+            # The prompt asks for the answer on the last line; strip first so a trailing newline is not taken as the answer
+            final_ans = output_ans.strip().split('\n')[-1]
+            score = int(normalize(final_ans) == normalize(ans))
+            out["score"] += score
+            out["data"] += [
+                {
+                    "question": question,
+                    "reasoning": output_ans,
+                    "answer": final_ans,
+                    "score": score,
+                }
+            ]
+        except Exception as e:
+            # Best effort: record the failure and carry on with the next example
+            out["data"] += [
+                {"question": question, "exception": str(e), "score": 0}
+            ]
+
+        # Rewrite the results after every example so an interrupted run keeps its progress
+        with open("data_baseline.json", "w") as outfile:
+            json.dump(out, outfile, indent=4)
+
+test_kinship(range(N))
\ No newline at end of file
diff --git a/experiments/clutrr-v2/kinship_cot.py b/experiments/clutrr-v2/kinship_cot.py
new file mode 100644
index 0000000..bd8ff48
--- /dev/null
+++ b/experiments/clutrr-v2/kinship_cot.py
@@ -0,0 +1,88 @@
+import openai
+import json
+import re
+from tqdm import tqdm
+
+from dataset_loader import CLUTRRDataset
+
+
+FEW_SHOT = True
+SHOTS = """
+Examples:
+Q: Dorothy's brother Michael and her went to get ice cream. Michael is the proud father of the lovely Donald. Who is Dorothy to Donald?
+A: aunt
+
+Q: Michael and his daughter Jennifer like to read poems together. Jason is the proud father of the lovely Michael. Who is Jason to Jennifer?
+A: grandfather
+
+Q: Kevin loves going to plays with his wife Aida. Aida's dad James, however, does not like them at all. Who is James to Kevin?
+A: father-in-law
+
+
+Now here is the question:
+"""
+COT_PROMPT = "Let's think step by step."
+COT_EXTRACTION = "Therefore, in one word, the answer is"
+
+TASK = CLUTRRDataset()
+N = len(TASK)
+
+
+def normalize(answer):
+    """Lower-case `answer` and drop every non-letter, so that e.g. 'Aunt.' compares equal to 'aunt'."""
+    return re.sub(r'[^a-zA-Z]', '', answer.strip().lower())
+
+
+def run_gpt(question):
+    """Send `question` to GPT-4 as a single user message and return the text of its reply."""
+    messages = [{"role": "user", "content": question}]
+    response = openai.ChatCompletion.create(
+        model="gpt-4",
+        messages=messages,
+        temperature=0,
+    )
+    return response["choices"][0]["message"]["content"]
+
+
+def test_tracking(range):
+    """Run two-step chain-of-thought prompting for every dataset index in `range`; results go to data_cot.json."""
+    out = {"score": 0, "data": []}
+
+    pbar = tqdm(range)
+    for i in pbar:
+        (ctx, query), ans = TASK[i]
+        story = ctx + " Who is " + query[1] + " to " + query[0] + "?"
+
+        question = f"Q: {story}\nA: {COT_PROMPT}"
+        try:
+            # Step 1: let the model reason freely (with the few-shot examples when enabled)
+            response = run_gpt((SHOTS + question) if FEW_SHOT else question)
+            # Step 2: feed the reasoning back and ask for a one-word answer
+            question2 = f"{question} {response}\n{COT_EXTRACTION}"
+            response2 = run_gpt(question2)
+            final_ans = response2.split()[-1]
+            score = int(normalize(final_ans) == normalize(ans))
+            out["score"] += score
+            out["data"] += [
+                {
+                    "question": story,
+                    "reasoning": response,
+                    "answer": final_ans,
+                    "correct_answer": ans,
+                    "score": score,
+                }
+            ]
+        except Exception as e:
+            # Best effort: record the failure and carry on with the next example
+            out["data"] += [
+                {"question": story, "exception": str(e), "score": 0}
+            ]
+
+        pbar.set_postfix({"score": out["score"]})
+
+        # Rewrite the results after every example so an interrupted run keeps its progress
+        with open("data_cot.json", "w") as outfile:
+            json.dump(out, outfile, indent=4)
+
+
+test_tracking(range(N))
\ No newline at end of file
diff --git a/experiments/clutrr-v2/kinship_scallop.py b/experiments/clutrr-v2/kinship_scallop.py
new file mode 100644
index 0000000..17eae7f
--- /dev/null
+++ b/experiments/clutrr-v2/kinship_scallop.py
@@ -0,0 +1,97 @@
+import openai
+import json
+import re
+from contextlib import redirect_stdout
+from tqdm import tqdm
+from io import StringIO
+import sys
+
+import scallopy
+import scallopy_ext
+
+from dataset_loader import CLUTRRDataset
+
+TASK = CLUTRRDataset()
+N = len(TASK)
+SCALLOP_FILE = "kinship.scl"
+
+class Args:
+    """Settings object passed to scallopy_ext.config.configure."""
+    def __init__(self):
+        self.cuda = False
+        self.gpu = None
+        self.num_allowed_openai_request = 100
+        self.openai_gpt_model = "gpt-4"
+        self.openai_gpt_temperature = 0
+
+def normalize(answer):
+    """Lower-case `answer` and drop every non-letter, so that e.g. 'Aunt.' compares equal to 'aunt'."""
+    return re.sub(r'[^a-zA-Z]', '', answer.strip().lower())
+
+def test_kinship(range=range(N)):
+    """Run kinship.scl on every dataset index in `range`; results and logs go to data_scallop.json."""
+    out = {"score": 0, "data": [], "logs": []}
+
+    pbar = tqdm(range)
+    for i in pbar:
+        (story, query), ans = TASK[i]
+        question = story + " Who is " + query[1] + " to " + query[0] + "?"
+
+        buffer = StringIO()
+        try:
+            # Capture everything printed while Scallop runs; redirect_stdout puts stdout back even on error
+            with redirect_stdout(buffer):
+                ctx = scallopy.ScallopContext(provenance="unit")
+                scallopy_ext.config.configure(Args(), [])
+                scallopy_ext.extlib.load_extlib(ctx)
+                ctx.import_file(SCALLOP_FILE)
+                ctx.add_facts("context", [(question,)])
+                ctx.run()
+                res = list(ctx.relation("result"))
+                score = 0
+                final_answer = ""
+                # The example counts as correct when any derived answer matches the label
+                for output_ans, in res:
+                    if normalize(output_ans) == normalize(ans):
+                        score = 1
+                        final_answer = output_ans
+                out["data"] += [
+                    {
+                        "id": i,
+                        "question": question,
+                        "final_answer": final_answer,
+                        "score": score,
+                        "mentioned_kinship": list(ctx.relation("kinship")),
+                        "derived_kinship": list(ctx.relation("derived_kinship")),
+                        "query": list(ctx.relation("question")),
+                        "answer": list(ctx.relation("result")),
+                        "num_answers": len(res),
+                    }
+                ]
+            out["score"] += score
+        except Exception as e:
+            out["data"] += [
+                {
+                    "id": i,
+                    "question": question,
+                    "exception": str(e),
+                    "score": 0,
+                }
+            ]
+
+        out["logs"] += [
+            {
+                "id": i,
+                "log": buffer.getvalue().encode("utf-8").decode("unicode_escape"),
+            }
+        ]
+
+        pbar.set_postfix({"score": out["score"]})
+
+        # Rewrite the results after every example so an interrupted run keeps its progress
+        with open("data_scallop.json", "w") as outfile:
+            json.dump(out, outfile, indent=2)
+
+    print(out["score"])
+
+test_kinship()
\ No newline at end of file
diff --git a/experiments/clutrr-v2/level_accuracy.py b/experiments/clutrr-v2/level_accuracy.py
new file mode 100644
index 0000000..573ec5f
--- /dev/null
+++ b/experiments/clutrr-v2/level_accuracy.py
@@ -0,0 +1,56 @@
+import ast
+import os
+import csv
+import json
+
+class ModifiedCLUTRRDataset:
+    # CLUTRR rows read from the `1.{d}_{split}.csv` file of every requested difficulty level d
+    def __init__(self, root=".", dataset="data_089907f8", split="test", difficulty=range(2,11)):
+        self.dataset_dir = os.path.join(root, f"CLUTRR/{dataset}/")
+        self.file_names = [os.path.join(self.dataset_dir, f"1.{d}_{split}.csv") for d in difficulty]
+        self.data = []
+        for f in self.file_names:
+            with open(f, newline="") as csv_file:  # closed as soon as its rows are read
+                self.data.extend(list(csv.reader(csv_file))[1:])  # [1:] skips the first row (presumably the header)
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, i):
+        # Context is a list of sentences; remove its square brackets
+        context = self.data[i][2].strip().replace('[', '').replace(']', '')
+        # Query is of type (sub, obj); literal_eval parses the stored literal without running code
+        query_sub_obj = ast.literal_eval(self.data[i][3])
+        query = (query_sub_obj[0], query_sub_obj[1])
+        # Answer is one of 20 classes such as daughter, mother, ...
+        answer = self.data[i][5]
+        return ((context, query), answer)
+
+with open("data_cot.json") as file:
+    data = json.load(file)
+
+levels = {d: ModifiedCLUTRRDataset(difficulty=[d]) for d in range(2, 11)}  # read each level from disk once
+datasets = {d: set(ctx + " Who is " + query[1] + " to " + query[0] + "?" for (ctx, query), _ in levels[d]) for d in levels}
+
+# accuracy_mapping[d] is [wrong, right] for level d; row 0 counts examples that matched no level, row 1 those that matched several
+accuracy_mapping = [[0, 0] for _ in range(11)]
+
+for ex in data["data"]:
+    matched = False
+    for d in range(2, 11):
+        if ex["question"] in datasets[d]:
+            if matched:
+                accuracy_mapping[1][ex["score"]] += 1
+            accuracy_mapping[d][ex["score"]] += 1
+            matched = True
+    if not matched:
+        accuracy_mapping[0][ex["score"]] += 1
+
+assert(accuracy_mapping[0] == [0, 0])
+assert(accuracy_mapping[1] == [0, 0])
+
+for d in range(2, 11):
+    correct = accuracy_mapping[d][1]
+    total = len(levels[d])
+    assert(correct + accuracy_mapping[d][0] == total)
+    print(f"Level {d}: {correct} / {total} = {correct / total * 100}%")
\ No newline at end of file