
Commit a669963

Merge pull request #204 from simjak/simonas/init
fix: init fixes to run it
2 parents b7d2eea + 36ef27e commit a669963


12 files changed: +194 -108 lines changed


.env.example

Whitespace-only changes.

.gitignore

+2
@@ -122,6 +122,7 @@ celerybeat.pid
 # Environments
 .env
 .venv
+.venv*
 env/
 venv/
 ENV/
@@ -171,3 +172,4 @@ test/
 memory/
 examples/document-processing/logs/
 !src/agents/datasets/data/math/test
+client.db

examples/chatbot/run.py

+11-3
@@ -1,10 +1,18 @@
 import os
-from agents import SolutionConfig, Solution
+
 import litellm
+from agents import Solution, SolutionConfig
 
+from dotenv import load_dotenv
 litellm.set_verbose = True
-os.environ["OPENAI_API_KEY"] = ""
-os.environ["OPENAI_BASE_URL"] = ""
+
+load_dotenv()
+
+# Set Environment Variables
+if os.environ.get("OPENAI_API_KEY") is None:
+    os.environ["OPENAI_API_KEY"] = ""
+if os.environ.get("OPENAI_BASE_URL") is None:
+    os.environ["OPENAI_BASE_URL"] = ""
 
 solution = Solution(config=SolutionConfig("examples/chatbot/config.json"))
 solution.run()
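The run script now prefers credentials from the shell environment or a local .env file and only falls back to empty placeholders. A minimal standalone sketch of the same pattern (the loop and the status print are illustrative additions, not part of the commit):

import os

from dotenv import load_dotenv

# Reads KEY=value pairs from a .env file in the working directory; by default
# python-dotenv does not override variables already set in the shell.
load_dotenv()

for var in ("OPENAI_API_KEY", "OPENAI_BASE_URL"):
    # Fall back to an empty placeholder only if the variable is still unset.
    if os.environ.get(var) is None:
        os.environ[var] = ""
    print(f"{var} is {'set' if os.environ[var] else 'empty'}")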

examples/software_dev/configs/SOP.json

+19-4
@@ -16,7 +16,12 @@
     "begin_role": "Boss",
     "node_description": "Imagine a scenario where the boss has presented a requirement. The architect is tasked with proposing a framework based on this requirement. The leader's role is to provide feedback on the architect's proposal, and another architect will finalize the framework based on the leader's comments.",
     "node_prompt_templates": {
-        "target": "The target program is: <target>{prompt}</target>"
+        "target": "The target program is: <target>{prompt}</target>",
+        "style": "You need to follow the output style: {style}.\n",
+        "task": "The task you need to execute is: {task}.\n",
+        "rule": "The rule you need to follow is: {rule}.\n",
+        "demonstrations": "Here are the demonstrations you can refer to:\n{demonstrations}.\n",
+        "last": "{last}"
     },
     "node_prompt_paddings": {
         "Boss": {
@@ -151,7 +156,12 @@
     "begin_role": "Boss",
     "node_description": "In this scenario, the boss has provided a requirement. The developer's task is to write code based on the architecture proposed by the architect. The leader evaluates the written code for elegance, readability, and functionality, providing feedback. Another developer makes necessary modifications to the code.",
     "node_prompt_templates": {
-        "target": "The target program is: <target>{prompt}</target>"
+        "target": "The target program is: <target>{prompt}</target>",
+        "style": "You need to follow the output style: {style}.\n",
+        "task": "The task you need to execute is: {task}.\n",
+        "rule": "The rule you need to follow is: {rule}.\n",
+        "demonstrations": "Here are the demonstrations you can refer to:\n{demonstrations}.\n",
+        "last": "{last}"
     },
     "node_prompt_paddings": {
         "Boss": {
@@ -305,7 +315,12 @@
     "node_description": "In this scenario, the boss has provided a requirement. The debugger simulates a compiler to determine whether the code is runnable and provides feedback. The developer writes code based on the debugger's feedback. The leader evaluates whether the final code meets the boss's requirements and provides feedback for further modifications. The coder writes the final code to a file.",
     "node_prompt_templates": {
         "target": "The target program is: <target>{prompt}</target>",
-        "finalize code": "Please consider all the messages above and integrate the final project code. Ensures the code is clean, well-formatted. {extract}"
+        "finalize code": "Please consider all the messages above and integrate the final project code. Ensures the code is clean, well-formatted. {extract}",
+        "style": "You need to follow the output style: {style}.\n",
+        "task": "The task you need to execute is: {task}.\n",
+        "rule": "The rule you need to follow is: {rule}.\n",
+        "demonstrations": "Here are the demonstrations you can refer to:\n{demonstrations}.\n",
+        "last": "{last}"
     },
     "node_prompt_paddings": {
         "Boss": {
@@ -480,7 +495,7 @@
     "value": {
         "extract": "Please extract the final project code and put them between <result> and </result>."
     }
-
+
     }
   }
 }
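The five added keys are ordinary Python format strings that a node can stitch together per role. A hedged sketch of how such a template block might be rendered (the render helper and the example values are illustrative, not the library's actual assembly code):

node_prompt_templates = {
    "target": "The target program is: <target>{prompt}</target>",
    "style": "You need to follow the output style: {style}.\n",
    "task": "The task you need to execute is: {task}.\n",
    "rule": "The rule you need to follow is: {rule}.\n",
    "demonstrations": "Here are the demonstrations you can refer to:\n{demonstrations}.\n",
    "last": "{last}",
}

def render(keys, **values):
    # Concatenate the selected templates in order, filling their placeholders.
    return "".join(node_prompt_templates[k].format(**values) for k in keys)

print(render(["task", "rule"], task="design the architecture", rule="answer in JSON"))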

examples/software_dev/configs/optimizer_config.json

+6-6
@@ -22,26 +22,26 @@
         "log_path": "logs/trainer_god"
     },
     "meta_prompt": {
-        "loss_without_score": {
+        "loss": {
             "order": [
-                "loss_with_ground_truth"
+                "loss_with_ground_truth_and_score"
             ],
             "extract_key": [
                 "score",
                 "requirement_for_previous"
             ],
-            "loss_with_ground_truth": "You are a fine-tuner of a large model. I will provide you with some output results from the model and the expected correct results. You need to evaluate these data and provide a score out of 10, please wrap the score using <score></score>. Additionally, please provide some suggestions for modifying the model's output, using <requirement_for_previous></requirement_for_previous> to wrap your suggestions.\n\nHere is the model's output:\n<result>{result}</result>;\n\nThe expected result is:\n<ground_truth>{ground_truth}</ground_truth>\n\nPlease note:\n\nEnsure that the output is wrapped with <score></score> and <requirement_for_previous></requirement_for_previous> respectively.\nThe output should be as consistent as possible with the expected result while being correct. For example, if the expected result is “BUST”, and the model's output is “The women's lifestyle magazine is 'BUST' magazine.”, even though the answer is correct, you should advise the model to be more concise.\nThe standard for a score of 10 is that the model's output is exactly the same as the expected result in a case-insensitive manner, and without any unnecessary content. Even if the model's output is semantically correct, if it includes superfluous content, points should be deducted.",
+            "loss_with_ground_truth_and_score": "You are a fine-tuner of a large model. I will provide you with some output results from the model and the expected correct results. You need to evaluate these data and provide a score out of 10, please wrap the score using <score></score>. Additionally, please provide some suggestions for modifying the model's output, using <requirement_for_previous></requirement_for_previous> to wrap your suggestions.\n\nHere is the model's output:\n<result>{result}</result>;\n\nThe expected result is:\n<ground_truth>{ground_truth}</ground_truth>\n\nPlease note:\n\nEnsure that the output is wrapped with <score></score> and <requirement_for_previous></requirement_for_previous> respectively.\nThe output should be as consistent as possible with the expected result while being correct. For example, if the expected result is “BUST”, and the model's output is “The women's lifestyle magazine is 'BUST' magazine.”, even though the answer is correct, you should advise the model to be more concise.\nThe standard for a score of 10 is that the model's output is exactly the same as the expected result in a case-insensitive manner, and without any unnecessary content. Even if the model's output is semantically correct, if it includes superfluous content, points should be deducted.",
             "loss_no_gt": "I will give you some model outputs. You need to score these data and give a rating on a 10-point scale; please wrap the score using <score></score>. In addition, please give some suggestions for modifying the model's output, wrapped with <requirement_for_previous></requirement_for_previous>.\n\nHere is the data you need to process: {result}.",
             "loss_no_gt_no_result": "I will give you some interaction records. You need to score these records and give a rating on a 10-point scale; please wrap the score using <score></score>. In addition, please give some suggestions for modifying the model's output, wrapped with <requirement_for_previous></requirement_for_previous>. The interaction information is as follows: {history}."
         },
-        "loss": {
+        "loss_without_score": {
             "order": [
-                "loss_with_ground_truth_and_score"
+                "loss_with_ground_truth"
             ],
             "extract_key": [
                 "requirement_for_previous"
             ],
-            "loss_with_ground_truth_and_score": "You are a large language model fine-tuner. I will provide you with a model's output and the expected correct result. You need to evaluate it and suggest modifications to the model's output. Please use `<requirement_for_previous></requirement_for_previous>` to enclose your feedback.\n\nBelow is the model's output:\n<result>{result}</result>\n\nThe expected result is:\n<ground_truth>{ground_truth}</ground_truth>\n\nHere is the evaluation score for the model. Your goal is to optimize this score:\n<score>{score}</score>\n\nThe relevant information about this score is as follows:\n<evaluation_info>{score_info}</evaluation_info>\n\nNote:\n1. Ensure that `<requirement_for_previous></requirement_for_previous>` exists and appears once.\n2. If the model's output is satisfactory, you can output <requirement_for_previous>The output is satisfactory, no additional requirements</requirement_for_previous>.\n3. The output should be as close to the expected result as possible while ensuring correctness. For example, if the expected result is \"BUST\" and the model's output is \"The women's lifestyle magazine is 'BUST' magazine.\", even though this answer is correct, you should remind the model to be concise."
+            "loss_with_ground_truth": "You are a large language model fine-tuner. I will provide you with a model's output and the expected correct result. You need to evaluate it and suggest modifications to the model's output. Please use `<requirement_for_previous></requirement_for_previous>` to enclose your feedback.\n\nBelow is the model's output:\n<result>{result}</result>\n\nThe expected result is:\n<ground_truth>{ground_truth}</ground_truth>\n\nHere is the evaluation score for the model. Your goal is to optimize this score:\n<score>{score}</score>\n\nThe relevant information about this score is as follows:\n<evaluation_info>{score_info}</evaluation_info>\n\nNote:\n1. Ensure that `<requirement_for_previous></requirement_for_previous>` exists and appears once.\n2. If the model's output is satisfactory, you can output <requirement_for_previous>The output is satisfactory, no additional requirements</requirement_for_previous>.\n3. The output should be as close to the expected result as possible while ensuring correctness. For example, if the expected result is \"BUST\" and the model's output is \"The women's lifestyle magazine is 'BUST' magazine.\", even though this answer is correct, you should remind the model to be concise."
         }
     }
 },
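Each meta prompt asks the model to wrap its verdict in <score> and <requirement_for_previous> tags, and extract_key names which tags to pull back out. A sketch of one way such tags could be extracted (the regex helper is an assumption for illustration, not the optimizer's actual parser):

import re

def extract_tag(text, key):
    # Return the content wrapped in <key>...</key>, or None if the tag is missing.
    match = re.search(rf"<{key}>(.*?)</{key}>", text, re.DOTALL)
    return match.group(1).strip() if match else None

reply = "<score>8</score><requirement_for_previous>Be more concise.</requirement_for_previous>"
print(extract_tag(reply, "score"))                      # 8
print(extract_tag(reply, "requirement_for_previous"))   # Be more concise.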

examples/software_dev/run_train.py

+4-2
@@ -4,8 +4,10 @@
 import litellm
 from agents.optimization.trainer import Trainer, TrainerConfig
 
-os.environ["OPENAI_API_KEY"] = ""
-os.environ["OPENAI_BASE_URL"] = ""
+if os.environ.get("OPENAI_API_KEY") is None:
+    os.environ["OPENAI_API_KEY"] = ""
+if os.environ.get("OPENAI_BASE_URL") is None:
+    os.environ["OPENAI_BASE_URL"] = ""
 
 litellm.set_verbose = False
 

src/agents/agents/llm.py

+11-4
@@ -15,13 +15,16 @@
 # limitations under the License.
 import os
 import time
-import litellm
-import backoff
 from abc import abstractmethod
 from typing import Union
 
-from ..utils.files import save_logs
+import litellm
+from dotenv import load_dotenv
+
 from ..utils.config import Config
+from ..utils.files import save_logs
+
+load_dotenv()
 
 WAIT_TIME = 20
 
@@ -30,10 +33,14 @@
 def completion_with_backoff(**kwargs):
     litellm.api_key = os.environ["OPENAI_API_KEY"]
     litellm.api_base = os.environ.get("OPENAI_BASE_URL")
+
+    if os.environ.get("OPENAI_API_KEY") is None:
+        raise ValueError("OPENAI_API_KEY is not set")
+
     while True:
         try:
             return litellm.completion(**kwargs)
-        except litellm.OpenAIError as e:
+        except litellm.OpenAIError:
             print(f"Please wait {WAIT_TIME} seconds and resend later ...")
             time.sleep(WAIT_TIME)
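With the backoff dependency removed, retries are now a plain loop that sleeps a fixed WAIT_TIME between attempts. The same pattern in isolation, with a fabricated flaky_call standing in for litellm.completion:

import time

WAIT_TIME = 1  # the module uses 20 seconds

attempts = {"count": 0}

def flaky_call():
    # Fails twice and then succeeds, just to exercise the retry loop.
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise RuntimeError("transient failure")
    return "ok"

while True:
    try:
        print(flaky_call())
        break
    except RuntimeError:
        print(f"Please wait {WAIT_TIME} seconds and resend later ...")
        time.sleep(WAIT_TIME)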

src/agents/datasets/software_dev.py

+17-12
@@ -39,18 +39,23 @@ def get_case_dict(self, idx: int):
         Returns:
             Dict[str, Any]: A dictionary with case details.
         """
-        return {
-            "case_id": "software_dev_" + str(self.data[idx]["task_id"]),
-            "case_name": self.data[idx]["task_name"],
-            "task_id": "software_dev",
-            "function_ids": "no use now",
-            "KB_id": "no use now",
-            "input": {"input_data": {"prompt": self.data[idx]["prompt"]}},
-            "ground_truth": self.data[idx].get("answer", None),
-            "idx": idx,
-            "metric_name": self.metric_name,
-            "metric_description": self.metric_description,
-        }
+        try:
+            return {
+                "case_id": "software_dev_" + str(self.data[idx]["task_id"]),
+                "case_name": self.data[idx]["task_name"],
+                "task_id": "software_dev",
+                "task_description": self.data[idx]["task_description"] if "task_description" in self.data[idx] else self.data[idx]["prompt"],
+                "function_ids": "no use now",
+                "KB_id": "no use now",
+                "input": {"input_data": {"prompt": self.data[idx]["prompt"]}},
+                "ground_truth": self.data[idx].get("answer", None),
+                "idx": idx,
+                "metric_name": self.metric_name,
+                "metric_description": self.metric_description,
+            }
+        except Exception as e:
+            print(f"Error: {e}, {self.data[idx]}")
+            raise e
 
     def evaluate(self, idx: int, answer: str):
         """

src/agents/evaluation/case.py

+26-22
@@ -29,34 +29,38 @@ def __init__(self, json_data: dict):
             json_data (dict): The JSON data to initialize the Case object.
         """
         # raw data, it will not be saved when dump
-        self.raw_data = json_data
+        try:
+            self.raw_data = json_data
 
-        self.case_id: str = json_data["case_id"]
-        self.case_name: str = json_data["case_name"]
+            self.case_id: str = json_data["case_id"]
+            self.case_name: str = json_data["case_name"]
 
-        self.task_id: str = json_data["task_id"]
-        self.task_description = json_data["task_description"]
+            self.task_id: str = json_data["task_id"]
+            self.task_description = json_data["task_description"]
 
-        self.function_ids: str = json_data["function_ids"]
-        self.KB_id: str = json_data["KB_id"]
+            self.function_ids: str = json_data["function_ids"]
+            self.KB_id: str = json_data["KB_id"]
 
-        self.input: dict = json_data["input"]
-        self.ground_truth: dict = json_data.get("ground_truth")
+            self.input: dict = json_data["input"]
+            self.ground_truth: dict = json_data.get("ground_truth")
 
-        # fields that not available until they are run
-        self.result: dict = json_data.get("result", {})  # the direct output the client expects
-        self.trajectory: Trajectory = Trajectory.load_from_json(
-            json_data.get("trajectory", [])
-        )
+            # fields that not available until they are run
+            self.result: dict = json_data.get("result", {})  # the direct output the client expects
+            self.trajectory: Trajectory = Trajectory.load_from_json(
+                json_data.get("trajectory", [])
+            )
 
-        # fields that not available until they are evaluated or optimized
-        self.dataset_eval: DatasetEvaluation = DatasetEvaluation(
-            **json_data.get("dataset_eval", {})
-        )  # Dataset evaluation results
-        self.loss: CaseLoss = CaseLoss(**json_data.get("loss", {}))  # evaluation results
-        self.sop_suggestion: SOPSuggestion = SOPSuggestion(
-            **json_data.get("sop_suggestion", {})
-        )  # Suggestions for SOP optimization
+            # fields that not available until they are evaluated or optimized
+            self.dataset_eval: DatasetEvaluation = DatasetEvaluation(
+                **json_data.get("dataset_eval", {})
+            )  # Dataset evaluation results
+            self.loss: CaseLoss = CaseLoss(**json_data.get("loss", {}))  # evaluation results
+            self.sop_suggestion: SOPSuggestion = SOPSuggestion(
+                **json_data.get("sop_suggestion", {})
+            )  # Suggestions for SOP optimization
+        except Exception as e:
+            print(f"Error: {e}, {json_data}")
+            raise e
 
     @classmethod
     def read_batch_from_json(cls, json_path):
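Wrapping the whole constructor in a try/except that prints the offending payload before re-raising makes it easier to see which record in a batch is malformed. A reduced sketch of the same pattern (build_case and the sample payload are hypothetical):

def build_case(json_data):
    try:
        # Touch the required keys so a malformed record fails loudly here.
        return {"case_id": json_data["case_id"], "case_name": json_data["case_name"]}
    except Exception as e:
        # Report which payload broke, then re-raise so the caller still sees the error.
        print(f"Error: {e}, {json_data}")
        raise e

print(build_case({"case_id": "demo_1", "case_name": "demo"}))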

src/agents/evaluation/state.py

+4-2
@@ -23,9 +23,11 @@ def __init__(
             environment (Environment): The environment associated with this state. A deep copy is made.
         """
         self.node = node
-        self.agent: Agent = copy.deepcopy(agent)  # a deep copy is needed here, and it is only a single agent
+        # self.agent: Agent = copy.deepcopy(agent)  # a deep copy is needed here, and it is only a single agent
+        self.agent: Agent = agent
         self.action: Action = action
-        self.environment: Environment = copy.deepcopy(environment)
+        # self.environment: Environment = copy.deepcopy(environment)
+        self.environment: Environment = environment
         self.node_eval = NodeEval(node.node_name, "", "", "", "")
         self.backward: StateBackward = StateBackward()
         self.node_backward: StateBackward = StateBackward()
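Dropping copy.deepcopy means the State now holds references to the live agent and environment, so later mutations are visible through the stored state rather than frozen at construction time. A small illustration of that difference (the Env class is hypothetical):

import copy

class Env:
    def __init__(self):
        self.messages = []

live = Env()
snapshot = copy.deepcopy(live)   # old behaviour: an independent frozen copy
reference = live                 # new behaviour: a shared reference

live.messages.append("hello")
print(len(snapshot.messages), len(reference.messages))  # 0 1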
