from datetime import datetime
from typing import Literal

from pydantic import BaseModel, Field
# System-prompt template for the navigation agent.
#
# The text is rendered with str.format() in get_navigation_prompt(), which
# fills the two placeholders {timestamp} and {output_format}. Because of
# str.format(), any other literal brace added here must be doubled.
#
# The wording is kept verbatim: it is sent to the model at runtime, so any
# rewording should be evaluated against the model before being merged.
# NOTE(review): the prompt talks about a "notes" field, while NavigationStep
# declares `note` and AnswerAction carries its answer in `content` - confirm
# whether the mismatch is intentional.
SYSTEM_PROMPT: str = """Imagine you are a robot browsing the web, just like humans. Now you need to complete a task.
In each iteration, you will receive an Observation that includes the last screenshots of a web browser and the current memory of the agent.
You have also information about the step that the agent is trying to achieve to solve the task.
Carefully analyze the visual information to identify what to do, then follow the guidelines to choose the following action.
You should detail your thought (i.e. reasoning steps) before taking the action.
Also detail in the notes field of the action the extracted information relevant to solve the task.
Once you have enough information in the notes to answer the task, return an answer action with the detailed answer in the notes field.
This will be evaluated by an evaluator and should match all the criteria or requirements of the task.
Guidelines:
- store in the notes all the relevant information to solve the task that fulfill the task criteria. Be precise
- Use both the task and the step information to decide what to do
- if you want to write in a text field and the text field already has text, designate the text field by the text it contains and its type
- If there is a cookies notice, always accept all the cookies first
- The observation is the screenshot of the current page and the memory of the agent.
- If you see relevant information on the screenshot to answer the task, add it to the notes field of the action.
- If there is no relevant information on the screenshot to answer the task, add an empty string to the notes field of the action.
- If you see buttons that allow to navigate directly to relevant information, like jump to ... or go to ... , use them to navigate faster.
- In the answer action, give as many details a possible relevant to answering the task.
- if you want to write, don't click before. Directly use the write action
- to write, identify the web element which is type and the text it already contains
- If you want to use a search bar, directly write text in the search bar
- Don't scroll too much. Don't scroll if the number of scrolls is greater than 3
- Don't scroll if you are at the end of the webpage
- Only refresh if you identify a rate limit problem
- If you are looking for a single flights, click on round-trip to select 'one way'
- Never try to login, enter email or password. If there is a need to login, then go back.
- If you are facing a captcha on a website, try to solve it.
- if you have enough information in the screenshot and in the notes to answer the task, return an answer action with the detailed answer in the notes field
- The current date is {timestamp}.
# <output_json_format>
# ```json
# {output_format}
# ```
# </output_json_format>
"""
class ClickElementAction(BaseModel):
    """Click at absolute coordinates of a web element with its description"""

    # The class docstring and the Field descriptions below are exported by
    # pydantic into the model's JSON schema, so treat them as runtime text.

    # Fixed tag identifying this action variant.
    action: Literal["click_element"] = Field(
        description="Click at absolute coordinates of a web element",
    )
    element: str = Field(
        description="text description of the element",
    )
    x: int = Field(
        description="The x coordinate, number of pixels from the left edge.",
    )
    y: int = Field(
        description="The y coordinate, number of pixels from the top edge.",
    )

    def log(self) -> str:
        """Return a first-person sentence describing this click."""
        target = self.element
        position = f"{self.x}, {self.y}"
        return f"I have clicked on the element '{target}' at absolute coordinates {position}"
class WriteElementAction(BaseModel):
    """Write content at absolute coordinates of a web element identified by its description, then press Enter."""

    # The class docstring and the Field descriptions below are exported by
    # pydantic into the model's JSON schema, so treat them as runtime text.

    # Fixed tag identifying this action variant.
    action: Literal["write_element_abs"] = Field(
        description="Write content at absolute coordinates of a web page",
    )
    content: str = Field(
        description="Content to write",
    )
    element: str = Field(
        description="Text description of the element",
    )
    x: int = Field(
        description="The x coordinate, number of pixels from the left edge.",
    )
    y: int = Field(
        description="The y coordinate, number of pixels from the top edge.",
    )

    def log(self) -> str:
        """Return a first-person sentence describing what was written and where."""
        written = f"I have written '{self.content}' in the element '{self.element}'"
        position = f"{self.x}, {self.y}"
        return f"{written} at absolute coordinates {position}"
class ScrollAction(BaseModel):
    """Scroll action with no required element"""

    # The class docstring and the Field descriptions below are exported by
    # pydantic into the model's JSON schema, so treat them as runtime text.

    # Fixed tag identifying this action variant.
    action: Literal["scroll"] = Field(
        description="Scroll the page or a specific element",
    )
    # One of the four allowed scroll directions.
    direction: Literal["down", "up", "left", "right"] = Field(
        description="The direction to scroll in",
    )

    def log(self) -> str:
        """Return a first-person sentence naming the scroll direction."""
        direction = self.direction
        return f"I have scrolled {direction}"
class GoBackAction(BaseModel):
    """Action to navigate back in browser history"""

    # The class docstring and the Field description below are exported by
    # pydantic into the model's JSON schema, so treat them as runtime text.

    # Fixed tag identifying this action variant; it carries no other data.
    action: Literal["go_back"] = Field(
        description="Navigate to the previous page",
    )

    def log(self) -> str:
        """Return the fixed first-person sentence for a back navigation."""
        message = "I have gone back to the previous page"
        return message
class RefreshAction(BaseModel):
    """Action to refresh the current page"""

    # The class docstring and the Field description below are exported by
    # pydantic into the model's JSON schema, so treat them as runtime text.

    # Fixed tag identifying this action variant; it carries no other data.
    action: Literal["refresh"] = Field(
        description="Refresh the current page",
    )

    def log(self) -> str:
        """Return the fixed first-person sentence for a page refresh."""
        message = "I have refreshed the page"
        return message
class GotoAction(BaseModel):
    """Action to go to a particular URL"""

    # The class docstring and the Field descriptions below are exported by
    # pydantic into the model's JSON schema, so treat them as runtime text.

    # Fixed tag identifying this action variant.
    action: Literal["goto"] = Field(
        description="Goto a particular URL",
    )
    # Plain string: the scheme requirement is stated in the description only,
    # it is not validated here.
    url: str = Field(
        description="A url starting with http:// or https://",
    )

    def log(self) -> str:
        """Return a first-person sentence naming the visited URL."""
        destination = self.url
        return f"I have navigated to the URL {destination}"
class WaitAction(BaseModel):
    """Action to wait for a particular amount of time"""

    # The class docstring and the Field descriptions below are exported by
    # pydantic into the model's JSON schema, so treat them as runtime text.

    # Fixed tag identifying this action variant.
    action: Literal["wait"] = Field(
        description="Wait for a particular amount of time",
    )
    # Optional; constrained to 0..10 inclusive and defaults to 2.
    seconds: int = Field(
        default=2,
        ge=0,
        le=10,
        description="The number of seconds to wait",
    )

    def log(self) -> str:
        """Return a first-person sentence stating how long the wait was."""
        duration = self.seconds
        return f"I have waited for {duration} seconds"
class RestartAction(BaseModel):
    """Restart the task from the beginning."""

    # The class docstring is exported by pydantic into the model's JSON
    # schema, so treat it as runtime text.

    # Tag identifying this action variant. It has a default, so the field may
    # be omitted when constructing or validating this model.
    action: Literal["restart"] = "restart"

    def log(self) -> str:
        """Return the fixed first-person sentence for a task restart."""
        message = "I have restarted the task from the beginning"
        return message
class AnswerAction(BaseModel):
    """Return a final answer to the task. This is the last action to call in an episode."""

    # The class docstring and the Field description below are exported by
    # pydantic into the model's JSON schema, so treat them as runtime text.

    # Tag identifying this action variant. It has a default, so the field may
    # be omitted when constructing or validating this model.
    action: Literal["answer"] = "answer"
    content: str = Field(
        description="The answer content",
    )

    def log(self) -> str:
        """Return a first-person sentence quoting the final answer."""
        answer = self.content
        return f"I have answered the task with '{answer}'"
# Union of every action model accepted in NavigationStep.action.
# Member order is kept as-is on purpose: it is the order in which the variants
# appear in the union and in the schema generated from it.
ActionSpace = (
    ClickElementAction | WriteElementAction | ScrollAction
    | GoBackAction | RefreshAction | WaitAction
    | RestartAction | AnswerAction | GotoAction
)
# One step of agent output: an optional note, a short thought, and the action.
# Documented with comments rather than a class docstring on purpose: pydantic
# would copy a docstring into the JSON schema as "description", changing the
# schema that get_navigation_prompt() renders into the system prompt.
class NavigationStep(BaseModel):
    # Optional; an empty string means nothing new was extracted.
    note: str = Field(
        default="",
        description=(
            "Task-relevant information extracted from the previous observation. "
            "Keep empty if no new info."
        ),
    )
    # Required free-text reasoning.
    thought: str = Field(
        description="Reasoning about next steps (<4 lines)",
    )
    # Required; must validate as one of the ActionSpace members.
    action: ActionSpace = Field(
        description="Next action to take",
    )
def get_navigation_prompt(
    task: str,
    image: object,
    step: int = 1,
    timestamp: str | None = None,
) -> list[dict[str, object]]:
    """Build the two-message chat prompt for one navigation step.

    Args:
        task: Task text, wrapped in ``<task>`` tags in the user message.
        image: Screenshot stored under the ``"image"`` key of the image part.
            It is passed through untouched; the expected type depends on the
            consumer of the messages (TODO confirm).
        step: Step number written into the ``<observation step=...>`` tag.
        timestamp: Text substituted for ``{timestamp}`` in ``SYSTEM_PROMPT``.
            When omitted, the current local time is used, formatted as
            ``YYYY-MM-DD HH:MM:SS``.

    Returns:
        A list with a system message (the rendered ``SYSTEM_PROMPT``) followed
        by a user message holding the task, the screenshot and the
        observation tags.
    """
    if timestamp is None:
        # The prompt announces "The current date is ...", so default to the
        # real clock instead of a frozen literal. Same layout as the previous
        # hard-coded value ("2025-06-04 14:16:03").
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # NOTE(review): model_json_schema() returns a dict, so str.format() embeds
    # its Python repr (single quotes) inside the ```json fence rather than
    # JSON text. Left unchanged because it alters the exact prompt sent to the
    # model - confirm before switching to json.dumps().
    system_prompt = SYSTEM_PROMPT.format(
        output_format=NavigationStep.model_json_schema(),
        timestamp=timestamp,
    )

    system_message = {
        "role": "system",
        "content": [
            {"type": "text", "text": system_prompt},
        ],
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": f"<task>\n{task}\n</task>\n"},
            {"type": "text", "text": f"<observation step={step}>\n"},
            {"type": "text", "text": "<screenshot>\n"},
            {
                "type": "image",
                "image": image,
            },
            {"type": "text", "text": "\n</screenshot>\n"},
            {"type": "text", "text": "\n</observation>\n"},
        ],
    }
    return [system_message, user_message]