Skip to content

Commit 564410c

Browse files
authored
Merge pull request #409 from xiongjyu/dev-jericho-llm-prior
polish the prompt of action and history
2 parents f4aa165 + 168199d commit 564410c

File tree

1 file changed

+16
-16
lines changed

1 file changed

+16
-16
lines changed

zoo/atari/envs/test_qwen_arati_env.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -168,9 +168,7 @@ def _build_messages_and_images(self, cur_img, allowed_names: List[str]):
168168
"type": "text",
169169
"text": (
170170
"Environment: Atari Pong (ALE) — two paddles rally a ball.\n"
171-
"Task: You control the RIGHT paddle. Keep your paddle vertically aligned with the ball to return it and avoid conceding.\n"
172-
"Serving rule: when a new point starts and the ball is not yet in play, you must SERVE using FIRE or *_FIRE; "
173-
"during an active rally, do NOT use FIRE actions and instead move appropriately."
171+
"Task: You control the right green paddle. Keep the paddle aligned vertically with the ball to return the ball and avoid losing it. The left paddle will hit the white ball, and you should try to land the white ball in the center of the right green paddle to return the ball and score.\n"
174172
)
175173
})
176174

@@ -193,24 +191,26 @@ def _build_messages_and_images(self, cur_img, allowed_names: List[str]):
193191
"type": "text",
194192
"text": (
195193
f"Available actions (choose exactly one string): {allowed_str}\n"
196-
"Action semantics:\n"
197-
f"{explain_text}\n"
198-
"Heuristic (to guide your choice): if the ball is above your paddle, choose an UP action (RIGHT/RIGHTFIRE when serving); "
199-
"if the ball is below, choose a DOWN action (LEFT/LEFTFIRE when serving); if perfectly aligned and rally is active, NOOP briefly is acceptable."
194+
# "Action semantics:\n"
195+
# f"{explain_text}\n"
196+
"Heuristic (to guide your choice): if the ball is above your paddle, choose an RIGHT action; "
197+
"if the ball is below, choose a LEFT action; if perfectly aligned and rally is active, NOOP briefly is acceptable."
200198
)
201199
})
202-
200+
203201
# 4) 历史交互轨迹(只包含:历史图片 + 当时选择的动作字符串)
204202
if len(self.buffer) > 0:
205-
content.append({"type": "text", "text": "Recent interaction history (most recent first):"})
206-
for tr in list(self.buffer)[::-1]: # 近 -> 远
203+
# 在输出历史之前,先加一句说明
204+
content.append({"type": "text", "text":
205+
f"Now you are at step t. The following shows the previous {len(self.buffer)} steps of history (from most recent to oldest):\n"
206+
})
207+
208+
# 然后再逐条列出历史轨迹
209+
for k, tr in enumerate(list(self.buffer)[::-1], start=1): # k=1 表示 t−1
210+
content.append({"type": "text", "text": f"(t−{k}) observation:"})
207211
content.append({"type": "image", "image": tr.image})
212+
content.append({"type": "text", "text": f"(t−{k}) you chose: {tr.action_str}\n"})
208213
images_for_processor.append(tr.image)
209-
# 再给该状态下我们选过的动作(仅动作字符串)
210-
content.append({
211-
"type": "text",
212-
"text": f"You chose the action: {tr.action_str}"
213-
})
214214

215215
# 5) 输出格式要求(只返回一行 {ACTION: <action_str>})
216216
content.append({
@@ -346,7 +346,7 @@ def record(self, prev_obs: dict, action_id: int, step: int):
346346
policy = QwenPongPolicy(
347347
model_name="/fs-computility/niuyazhe/shared/xiongjyu/model/Qwen2.5-VL-3B-Instruct",
348348
dtype=torch.bfloat16,
349-
history_n=5,
349+
history_n=2,
350350
use_pil=True,
351351
channel_last=config.channel_last,
352352
device=device,

0 commit comments

Comments
 (0)