Close Menu
    Facebook X (Twitter) Instagram
    Articles Stock
    • Home
    • Technology
    • AI
    • Pages
      • About us
      • Contact us
      • Disclaimer For Articles Stock
      • Privacy Policy
      • Terms and Conditions
    Facebook X (Twitter) Instagram
    Articles Stock
    AI

    Build a Vision-Guided Web AI Agent with MolmoWeb-4B Using Multimodal Reasoning and Action Prediction

    Naveed Ahmad | By Naveed Ahmad | 26/03/2026 | Updated: 26/03/2026 | No Comments | 3 Mins Read
    blog 7


    def parse_click_coords(action_str):
        """
        Extract the normalised (x, y) coordinates from a click action string.

        e.g., 'click on(0.45, 0.32)' -> (0.45, 0.32)

        Args:
            action_str: Action text to scan; the click may appear anywhere in it.

        Returns:
            A ``(x, y)`` tuple of floats, or None when ``action_str`` is not a
            string or does not contain a well-formed click.
        """
        if not isinstance(action_str, str):
            return None

        # One plain decimal ("1", "0.45", "1.", ".5"). A looser character class
        # would also accept "." or "1.2.3" and make float() raise below.
        number = r"(\d+\.?\d*|\.\d+)"

        # The parentheses of the action call must be escaped to match literally.
        # NOTE(review): "click on" is the spelling documented here; the bare
        # "click(" form is accepted as well in case the model emits it - confirm.
        match = re.search(
            rf"click(?: on)?\(\s*{number}\s*,\s*{number}\s*\)", action_str
        )
        if match is None:
            return None
        return float(match.group(1)), float(match.group(2))
    
    
    
    
    def parse_action_details(action_str):
        """
        Parse a MolmoWeb action string into a structured dict.

        The string is stripped and then matched from its start against each
        known action form in turn.

        Returns one of:
            {"sort": "click on", "x": 0.45, "y": 0.32}
            {"sort": "goto", "url": "https://..."}
            {"sort": "sort", "textual content": "query text"}
            {"sort": "scroll", "path": "down"}
            {"sort": "press", "key": "Enter"}
            {"sort": "send_msg", "message": "The reply is ..."}
            {"sort": "new_tab" | "go_back" | "switch_tab"}, plus "tab": int
                when a tab index is given inside the parentheses
            {"sort": "unknown", "uncooked": "..."} for anything else,
                including non-string input
        """
        if not isinstance(action_str, str):
            return {"sort": "unknown", "uncooked": action_str}
        action_str = action_str.strip()

        # One plain decimal; rejects "." and "1.2.3" so float() cannot raise.
        number = r"(\d+\.?\d*|\.\d+)"
        # NOTE(review): "click on" / "sort" are the verbs documented here; the
        # bare "click" / "type" spellings are accepted too - confirm against
        # real model output.
        m = re.match(rf"click(?: on)?\(\s*{number}\s*,\s*{number}\s*\)", action_str)
        if m:
            return {"sort": "click on", "x": float(m.group(1)), "y": float(m.group(2))}

        # Actions taking exactly one quoted string argument:
        # (verb pattern, reported "sort", key for the argument, regex flags).
        quoted_arg = r"""\(\s*["'](.+?)["']\s*\)"""
        single_arg_actions = (
            (r"goto", "goto", "url", 0),
            (r"(?:sort|type)", "sort", "textual content", 0),
            (r"press", "press", "key", 0),
            # Messages may span several lines, so "." must match newlines too.
            (r"send_msg", "send_msg", "message", re.DOTALL),
        )
        for verb, kind, key, flags in single_arg_actions:
            m = re.match(verb + quoted_arg, action_str, flags)
            if m:
                return {"sort": kind, key: m.group(1)}

        # The scroll direction may be quoted or bare.
        m = re.match(r"""scroll\(\s*["']?(up|down)["']?\s*\)""", action_str)
        if m:
            return {"sort": "scroll", "path": m.group(1)}

        # Tab actions take an optional integer index.
        m = re.match(r"(new_tab|go_back|switch_tab)\(\s*(\d*)\s*\)", action_str)
        if m:
            outcome = {"sort": m.group(1)}
            if m.group(2):
                outcome["tab"] = int(m.group(2))
            return outcome

        return {"sort": "unknown", "uncooked": action_str}
    
    
    
    
    def visualise_click(picture, action_str, title="MolmoWeb Prediction"):
        """
        Draw the predicted click location on the screenshot and show it.

        Coordinates in ``action_str`` are normalised (0-1) and are scaled to
        pixel space using the screenshot's size. When the action is not a
        click, the raw action text is shown in a banner near the bottom of the
        plot instead.

        Args:
            picture: Screenshot to display. Its ``size`` attribute is read as
                ``(width, height)`` - assumes a PIL image; confirm with callers.
            action_str: Action string, passed to ``parse_click_coords``.
            title: Title placed above the plot.
        """
        coords = parse_click_coords(action_str)

        _, ax = plt.subplots(1, 1, figsize=(12, 7))
        ax.imshow(picture)
        ax.set_title(title, fontsize=14)

        if coords:
            x_norm, y_norm = coords
            w, h = picture.size
            x_px, y_px = x_norm * w, y_norm * h

            # Ring plus cross-hair marking the exact predicted pixel.
            circle = patches.Circle(
                (x_px, y_px), radius=18, linewidth=3,
                edgecolor="crimson", facecolor="none",
            )
            ax.add_patch(circle)
            ax.plot(x_px, y_px, "r+", markersize=20, markeredgewidth=3)

            # Label offset up and to the right so it does not cover the marker.
            ax.annotate(
                f"click on({x_norm:.3f}, {y_norm:.3f})",
                (x_px, y_px), xytext=(x_px + 25, y_px - 25),
                fontsize=11, color="white",
                bbox=dict(boxstyle="round,pad=0.3", facecolor="crimson", alpha=0.8),
                arrowprops=dict(arrowstyle="->", color="crimson", lw=2),
            )
        else:
            # Axes-relative coordinates keep the banner centred at the bottom
            # regardless of the image's pixel size.
            ax.text(
                0.5, 0.02, f"Motion: {action_str}", transform=ax.transAxes,
                fontsize=12, ha="center", color="white",
                bbox=dict(boxstyle="round,pad=0.4", facecolor="blue", alpha=0.8),
            )

        ax.axis("off")
        plt.tight_layout()
        plt.show()
    
    
    
    
    def download_image(url, dimension=(1280, 720)):
        """
        Download an image from a URL and resize it to browser viewport dimensions.

        Args:
            url: Address of the image to fetch.
            dimension: Target ``(width, height)`` passed to ``Image.resize``.

        Returns:
            The downloaded image converted to RGB and resized to ``dimension``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        # Local import so the function does not depend on how the file imports PIL.
        from PIL import Image

        response = requests.get(url, timeout=15)
        # Fail on 4xx/5xx instead of handing an error page to the image decoder.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert("RGB")
        return img.resize(dimension, Image.LANCZOS)
    
    
    
    
    def create_synthetic_webpage(title="Instance Web page", parts=None):
        """
        Create a synthetic 1280x720 webpage screenshot for testing.

        The image has a grey browser bar (three round buttons and an address
        box), the page title, and then any requested elements.

        Args:
            title: Text drawn as the page heading.
            parts: Optional list of dicts, each with:
                "type": one of "button", "enter", "textual content",
                    "hyperlink"; any other value is skipped,
                "pos":  (x, y) top-left pixel position,
                "text": the label to draw.

        Returns:
            The rendered RGB image.
        """
        # Local import so the function does not depend on how the file imports PIL.
        from PIL import Image

        img = Image.new("RGB", (1280, 720), color=(255, 255, 255))
        draw = ImageDraw.Draw(img)

        # Browser chrome: toolbar strip, address box and its URL text.
        draw.rectangle([0, 0, 1280, 50], fill=(240, 240, 240))
        draw.rectangle([180, 10, 900, 40], outline=(200, 200, 200), width=1, fill="white")
        draw.text((200, 16), "https://www.instance.com", fill=(100, 100, 100))

        # Three round window buttons at the left of the toolbar.
        for cx in [30, 60, 90]:
            draw.ellipse([cx - 8, 17, cx + 8, 33], fill=(200, 200, 200))

        draw.text((50, 70), title, fill="black")

        for el in parts or []:
            x, y = el["pos"]
            kind = el["type"]
            if kind == "button":
                draw.rectangle([x, y, x + 150, y + 35], fill=(66, 133, 244))
                draw.text((x + 10, y + 8), el["text"], fill="white")
            elif kind == "enter":
                # Outlined box with grey placeholder text.
                draw.rectangle([x, y, x + 300, y + 35], outline=(180, 180, 180), width=2)
                draw.text((x + 10, y + 8), el["text"], fill=(150, 150, 150))
            elif kind == "textual content":
                draw.text((x, y), el["text"], fill="black")
            elif kind == "hyperlink":
                draw.text((x, y), el["text"], fill=(66, 133, 244))

        return img
    
    
    
    
    # Local import so this script section has PIL's Image in scope on its own.
    from PIL import Image

    print("Helper capabilities outlined efficiently.")

    # Section 5: run one inference step starting from an empty page.
    print("\n" + "=" * 70)
    print("SECTION 5: Single-step inference - clean web page (chilly begin)")
    print("=" * 70)
    print("The agent begins at about:clean and should determine its first motion.\n")

    # A plain white frame the size of the browser viewport stands in for the
    # screenshot of an empty page.
    blank_image = Image.new("RGB", (1280, 720), color="white")

    process = "Go to arxiv.org and discover the most recent paper about Molmo from Ai2"

    # NOTE(review): "about:clean" is presumably meant to be the browser's
    # "about:blank" URL - confirm before relying on the prompt text.
    immediate = build_prompt(
        task_description=process,
        page_url="about:clean",
        page_index=0,
    )

    print(f"Activity: {process}")
    print("Screenshot: clean white picture (about:clean)")
    print("Working inference...\n")

    raw_output = run_inference(immediate, blank_image)

    print(f"Uncooked mannequin output:\n{raw_output}\n")

    # Split the raw output into its thought and action parts, then structure
    # the action for display.
    parsed = parse_thought_and_action(raw_output)
    print(f"Thought: {parsed['thought']}")
    print(f"Motion:  {parsed['action']}")

    action_details = parse_action_details(parsed["action"])
    print(f"Parsed:  {action_details}")



    Source link

    Naveed Ahmad

    Related Posts

    Who’s driving Waymo’s self-driving automobiles? Typically, the police.

    26/03/2026

    Mercor competitor Deccan AI raises $25M, sources specialists from India

    26/03/2026

    Delve did the safety compliance on LiteLLM, an AI mission hit by malware

    26/03/2026
    Leave A Reply Cancel Reply

    Categories
    • AI
    Recent Comments
      Facebook X (Twitter) Instagram Pinterest
      © 2026 ThemeSphere. Designed by ThemeSphere.

      Type above and press Enter to search. Press Esc to cancel.