Skip to content

Commit b488eda

Browse files
authored
Merge pull request #18 from reworkd/rohan
Tagging improvements
2 parents abad571 + 647c2d1 commit b488eda

File tree

3 files changed

+32
-7
lines changed

3 files changed

+32
-7
lines changed

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,12 @@ if __name__ == '__main__':
8484
asyncio.run(main())
8585
```
8686

87+
Keep in mind that Tarsier tags each type of element differently to help your LLM identify which actions can be performed on each element. Specifically:
88+
- `[#ID]`: text-insertable fields (e.g. `textarea`, or `input` with a textual type such as `text`, `email`, or `password`)
89+
- `[@ID]`: hyperlinks (`<a>` tags)
90+
- `[$ID]`: other interactable elements (e.g. `button`, `select`)
91+
- `[ID]`: plain text (if you pass `tag_text_elements=True`)
92+
8793
## Local Development
8894

8995
### Setup

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "tarsier"
3-
version = "0.4.4"
3+
version = "0.5.0"
44
description = "Vision utilities for web interaction agents"
55
authors = ["Rohan Pandey", "Adam Watkins", "Asim Shrestha"]
66
readme = "README.md"

tarsier/tag_utils.ts

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,11 @@ const isInteractable = (el: HTMLElement) =>
2222
(el.tagName.toLowerCase() === "input" && el.type !== "hidden") ||
2323
el.role === "button";
2424

25+
const text_input_types = ["text", "password", "email", "search", "url", "tel", "number"];
2526
const isTextInsertable = (el: HTMLElement) =>
26-
(["input", "textarea"].includes(el.tagName.toLowerCase()));
27+
el.tagName.toLowerCase() === "textarea" ||
28+
((el.tagName.toLowerCase() === "input" &&
29+
text_input_types.includes((el as HTMLInputElement).type)));
2730

2831
const emptyTagWhitelist = ["input", "textarea", "select", "button"];
2932
const isEmpty = (el: HTMLElement) => {
@@ -107,14 +110,27 @@ function getElementXPath(element: HTMLElement | null) {
107110
return iframe_str + "//" + path_parts.join("/");
108111
}
109112

110-
function create_tagged_span(idStr: string) {
113+
function create_tagged_span(idNum: number, el: HTMLElement) {
114+
let idStr: string;
115+
if (isInteractable(el)) {
116+
if (isTextInsertable(el))
117+
idStr = `[#${idNum}]`;
118+
else if (el.tagName.toLowerCase() == 'a')
119+
idStr = `[@${idNum}]`;
120+
else
121+
idStr = `[$${idNum}]`;
122+
} else {
123+
idStr = `[${idNum}]`;
124+
}
125+
111126
let idSpan = document.createElement("span");
112127
idSpan.id = "__tarsier_id";
113128
idSpan.style.all = "inherit";
114129
idSpan.style.display = "inline";
115130
idSpan.style.color = "white";
116131
idSpan.style.backgroundColor = "red";
117132
idSpan.textContent = idStr;
133+
118134
return idSpan;
119135
}
120136

@@ -183,17 +199,20 @@ window.tagifyWebpage = (tagLeafTexts = false) => {
183199
continue;
184200
}
185201

186-
const idStr = isTextInsertable(el) ? `{${idNum}}` : `[${idNum}]`;
187-
let idSpan = create_tagged_span(idStr);
202+
let idSpan = create_tagged_span(idNum, el);
188203

189204
if (isInteractable(el)) {
190-
el.prepend(idSpan);
205+
if (isTextInsertable(el) && el.parentElement) {
206+
el.parentElement.insertBefore(idSpan, el);
207+
} else {
208+
el.prepend(idSpan);
209+
}
191210
idNum++;
192211
} else if (tagLeafTexts) {
193212
for (let child of Array.from(el.childNodes)) {
194213
if (child.nodeType === Node.TEXT_NODE && /\S/.test(child.textContent || "")) {
195214
// This is a text node with non-whitespace text
196-
let idSpan = create_tagged_span(idStr);
215+
let idSpan = create_tagged_span(idNum, el);
197216
el.insertBefore(idSpan, child);
198217
idNum++;
199218
}

0 commit comments

Comments
 (0)