Build a Vision-Guided Web AI Agent with MolmoWeb-4B Using Multimodal Reasoning and Action Prediction

def parse_click_coords(action_str):
    """
    Extract normalised (x, y) coordinates from a click action string.

    The click call may appear anywhere in ``action_str`` (``re.search`` is
    used, not ``re.match``).  Both ``click(x, y)`` and the ``click on(x, y)``
    spelling are accepted.

    e.g., 'click(0.45, 0.32)'    -> (0.45, 0.32)
          'click on(0.45, 0.32)' -> (0.45, 0.32)

    Args:
        action_str: Action text to scan.

    Returns:
        A ``(float, float)`` tuple, or None if no well-formed click call is
        found (including malformed numbers such as ``1.2.3``).
    """
    # One plain decimal number: "1", "1.", "0.45" or ".5".  A looser
    # character class like [\d.]+ would also accept "1.2.3" and make float()
    # raise instead of letting the function report "not a click".
    number = r"(\d+(?:\.\d*)?|\.\d+)"
    match = re.search(
        rf"click(?: on)?\(\s*{number}\s*,\s*{number}\s*\)", action_str
    )
    if match is None:
        return None
    return float(match.group(1)), float(match.group(2))




def parse_action_details(action_str):
    """
    Parse a MolmoWeb action string into a structured dict.

    The string is stripped and matched from its start against each known
    action form.  Anything after the closing parenthesis is ignored.

    Returns one of:
              {"sort": "click on", "x": 0.45, "y": 0.32}
              {"sort": "goto", "url": "https://..."}
              {"sort": "sort", "textual content": "query text"}
              {"sort": "scroll", "path": "down"}
              {"sort": "press", "key": "Enter"}
              {"sort": "send_msg", "message": "The answer is ..."}
              {"sort": "new_tab" | "go_back" | "switch_tab"}  (+ "tab": int
                  when a tab index is given)
              {"sort": "unknown", "uncooked": "..."}

    NOTE(review): the key/value spellings above ("sort", "click on",
    "textual content", "path", "uncooked") are kept exactly as this function
    has always documented them so existing callers keep working; they look
    like they were meant to be "type", "click", "text", "direction" and
    "raw" -- confirm before renaming.

    Args:
        action_str: A single action call such as ``click(0.45, 0.32)``.

    Returns:
        A dict whose "sort" entry names the action; see the shapes above.
    """
    action_str = action_str.strip()

    # One plain decimal number ("1", "1.", "0.45", ".5").  Malformed numbers
    # such as "1.2.3" fail to match and fall through to "unknown" instead of
    # raising in float().
    number = r"(\d+(?:\.\d*)?|\.\d+)"
    # A single quoted argument: ("...") or ('...').
    quoted_arg = r'\(\s*["\'](.+?)["\']\s*\)'

    # Click: accepts "click(x, y)" and the "click on(x, y)" spelling.
    m = re.match(
        rf"click(?: on)?\(\s*{number}\s*,\s*{number}\s*\)", action_str
    )
    if m:
        return {
            "sort": "click on",
            "x": float(m.group(1)),
            "y": float(m.group(2)),
        }

    # Actions taking exactly one quoted string:
    # (accepted call names, reported "sort" value, payload key, regex flags)
    quoted_actions = (
        ("goto", "goto", "url", 0),
        # "type(...)" is assumed to be the model's spelling; "sort(...)" is
        # the spelling this file uses.  Both map to the same result.
        ("type|sort", "sort", "textual content", 0),
        ("press", "press", "key", 0),
        # Messages may span several lines, so let "." match newlines too.
        ("send_msg", "send_msg", "message", re.DOTALL),
    )
    for names, sort_value, payload_key, flags in quoted_actions:
        m = re.match(rf"(?:{names}){quoted_arg}", action_str, flags)
        if m:
            return {"sort": sort_value, payload_key: m.group(1)}

    # Scroll: the direction may be quoted or bare.
    m = re.match(r'scroll\(\s*["\']?(up|down)["\']?\s*\)', action_str)
    if m:
        return {"sort": "scroll", "path": m.group(1)}

    # Tab / history actions with an optional integer tab index.
    m = re.match(r"(new_tab|go_back|switch_tab)\(\s*(\d*)\s*\)", action_str)
    if m:
        outcome = {"sort": m.group(1)}
        if m.group(2):
            outcome["tab"] = int(m.group(2))
        return outcome

    return {"sort": "unknown", "uncooked": action_str}




def visualise_click(picture, action_str, title="MolmoWeb Prediction"):
    """
    Display a screenshot with the predicted click location drawn on it.

    If ``parse_click_coords(action_str)`` yields coordinates, they are scaled
    by the image width/height and marked with a circle, a cross and a label.
    Otherwise the raw action string is shown in a banner near the bottom of
    the axes.

    Args:
        picture: Screenshot to show.  Must be accepted by ``ax.imshow`` and
            expose ``.size`` as ``(width, height)`` -- presumably a PIL
            image; confirm against callers.
        action_str: Action string predicted for this screenshot.
        title: Title placed above the figure.

    Returns:
        None.  The figure is shown via ``plt.show()``.
    """
    coords = parse_click_coords(action_str)

    fig, ax = plt.subplots(1, 1, figsize=(12, 7))
    ax.imshow(picture)
    ax.set_title(title, fontsize=14)

    if coords is not None:
        x_norm, y_norm = coords
        # Coordinates are treated as fractions of the image size and
        # converted to pixel positions here.
        w, h = picture.size
        x_px, y_px = x_norm * w, y_norm * h

        circle = patches.Circle(
            (x_px, y_px), radius=18, linewidth=3,
            edgecolor="crimson", facecolor="none",
        )
        ax.add_patch(circle)
        ax.plot(x_px, y_px, "r+", markersize=20, markeredgewidth=3)

        # Label offset up and to the right so it does not cover the marker.
        ax.annotate(
            f"click on({x_norm:.3f}, {y_norm:.3f})",
            (x_px, y_px), xytext=(x_px + 25, y_px - 25),
            fontsize=11, color="white",
            bbox=dict(boxstyle="round,pad=0.3", facecolor="crimson", alpha=0.8),
            arrowprops=dict(arrowstyle="->", color="crimson", lw=2),
        )
    else:
        # Not a click: show the action text in axes-relative coordinates so
        # the banner position does not depend on the image resolution.
        ax.text(
            0.5, 0.02, f"Motion: {action_str}", transform=ax.transAxes,
            fontsize=12, ha="center", color="white",
            bbox=dict(boxstyle="round,pad=0.4", facecolor="blue", alpha=0.8),
        )

    ax.axis("off")
    plt.tight_layout()
    plt.show()



def download_image(url, dimension=(1280, 720)):
    """
    Download an image from a URL and resize it to the given dimensions.

    Args:
        url: Address of the image to fetch.
        dimension: Target ``(width, height)`` passed to ``resize``; defaults
            to 1280x720.

    Returns:
        The downloaded image, converted to RGB and resized with LANCZOS
        resampling.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Other ``requests`` exceptions (e.g. timeout after 15 s) and Pillow
        decoding errors propagate unchanged.
    """
    # Imported locally so this helper does not depend on how (or whether) the
    # top of the file imports PIL's Image module.
    from PIL import Image

    response = requests.get(url, timeout=15)
    # Fail on 4xx/5xx here rather than handing an HTML error page to the
    # image decoder, which would produce a confusing decode error.
    response.raise_for_status()
    img = Image.open(BytesIO(response.content)).convert("RGB")
    return img.resize(dimension, Image.LANCZOS)




def create_synthetic_webpage(title="Instance Web page", parts=None):
    """
    Create a synthetic 1280x720 webpage screenshot for testing.

    The image has a grey browser bar with three round buttons and an address
    field, the page title below it, and then any requested elements.

    Args:
        title: Text drawn as the page heading at (50, 70).
        parts: Optional list of element dicts, each with
            ``"type"``: one of "button", "input", "text", "link"
                (the spellings "enter", "textual content" and "hyperlink"
                are accepted as aliases for the last three),
            ``"text"``: the label to draw,
            ``"pos"``: ``(x, y)`` pixel position of the top-left corner.
            Elements with any other type are skipped.

    Returns:
        The rendered RGB image.
    """
    # Imported locally so this helper does not depend on how (or whether) the
    # top of the file imports PIL's Image module.
    from PIL import Image

    img = Image.new("RGB", (1280, 720), color=(255, 255, 255))
    draw = ImageDraw.Draw(img)

    # Browser chrome: toolbar strip, address field and its URL text.
    draw.rectangle([0, 0, 1280, 50], fill=(240, 240, 240))
    draw.rectangle(
        [180, 10, 900, 40], outline=(200, 200, 200), width=1, fill="white"
    )
    draw.text((200, 16), "https://www.instance.com", fill=(100, 100, 100))

    # Three round window-control buttons on the left of the toolbar.
    for cx in [30, 60, 90]:
        draw.ellipse([cx - 8, 17, cx + 8, 33], fill=(200, 200, 200))

    draw.text((50, 70), title, fill="black")

    for el in parts or []:
        x, y = el["pos"]
        kind = el["type"]
        if kind == "button":
            draw.rectangle([x, y, x + 150, y + 35], fill=(66, 133, 244))
            draw.text((x + 10, y + 8), el["text"], fill="white")
        elif kind in ("input", "enter"):
            draw.rectangle(
                [x, y, x + 300, y + 35], outline=(180, 180, 180), width=2
            )
            draw.text((x + 10, y + 8), el["text"], fill=(150, 150, 150))
        elif kind in ("text", "textual content"):
            draw.text((x, y), el["text"], fill="black")
        elif kind in ("link", "hyperlink"):
            draw.text((x, y), el["text"], fill=(66, 133, 244))

    return img




print("Helper capabilities outlined efficiently.")


# Section 5: one inference step from a cold start.  The model is shown an
# empty white screenshot together with the task and must propose its first
# action, which is then split into thought/action and parsed.

# Imported here so this section does not depend on how (or whether) the top
# of the file imports PIL's Image module.
from PIL import Image

print("\n" + "=" * 70)
print("SECTION 5: Single-step inference - clean web page (chilly begin)")
print("=" * 70)
print("The agent begins at about:blank and should determine its first motion.\n")


# Plain white frame standing in for the screenshot of an empty tab.
blank_image = Image.new("RGB", (1280, 720), color="white")


process = "Go to arxiv.org and discover the most recent paper about Molmo from Ai2"


immediate = build_prompt(
    task_description=process,
    page_url="about:blank",
    page_index=0,
)


print(f"Activity: {process}")
print("Screenshot: clean white picture (about:blank)")
print("Working inference...\n")


raw_output = run_inference(immediate, blank_image)


print(f"Uncooked mannequin output:\n{raw_output}\n")


# Split the raw generation into its reasoning and action parts, then turn the
# action string into a structured dict.
parsed = parse_thought_and_action(raw_output)
print(f"Thought: {parsed['thought']}")
print(f"Motion:  {parsed['action']}")


action_details = parse_action_details(parsed["action"])
print(f"Parsed:  {action_details}")

Source link

Build a Vision-Guided Web AI Agent with MolmoWeb-4B Using Multimodal Reasoning and Action Prediction

Mercor competitor Deccan AI raises $25M, sources specialists from India

Delve did the safety compliance on LiteLLM, an AI mission hit by malware

The AI expertise hole is right here, says AI firm, and energy customers are pulling forward

Build a Vision-Guided Web AI Agent with MolmoWeb-4B Using Multimodal Reasoning and Action Prediction

Related Posts

Mercor competitor Deccan AI raises $25M, sources specialists from India

Delve did the safety compliance on LiteLLM, an AI mission hit by malware

The AI expertise hole is right here, says AI firm, and energy customers are pulling forward