Close Menu
    Facebook X (Twitter) Instagram
    Articles Stock
    • Home
    • Technology
    • AI
    • Pages
      • About us
      • Contact us
      • Disclaimer For Articles Stock
      • Privacy Policy
      • Terms and Conditions
    Facebook X (Twitter) Instagram
    Articles Stock
    AI

    A Coding Implementation of Crawl4AI for Web Crawling, Markdown Generation, JavaScript Execution, and LLM-Based Structured Extraction

    Naveed AhmadBy Naveed Ahmad15/04/2026Updated:15/04/2026No Comments2 Mins Read
    blog 38


    import subprocess
    import sys


    # Install the native shared libraries headless Chromium needs (Debian/Ubuntu,
    # e.g. Google Colab). Output is suppressed; failures are tolerated because
    # the Playwright install-deps step below covers the same libraries.
    print("📦 Installing system dependencies...")
    subprocess.run(['apt-get', 'update', '-qq'], capture_output=True)
    subprocess.run(['apt-get', 'install', '-y', '-qq',
                   'libnss3', 'libnspr4', 'libatk1.0-0', 'libatk-bridge2.0-0',
                   'libcups2', 'libdrm2', 'libxkbcommon0', 'libxcomposite1',
                   'libxdamage1', 'libxfixes3', 'libxrandr2', 'libgbm1',
                   'libasound2', 'libpango-1.0-0', 'libcairo2'], capture_output=True)
    print("✅ System dependencies installed!")


    # Install the Python packages used by the rest of this script.
    print("\n📦 Installing Python packages...")
    subprocess.run([sys.executable, '-m', 'pip', 'install', '-U', 'crawl4ai', 'nest_asyncio', 'pydantic', '-q'])
    print("✅ Python packages installed!")


    # Download the Chromium browser binary Playwright drives, plus its OS deps.
    print("\n📦 Installing Playwright browsers (this will take a minute)...")
    subprocess.run([sys.executable, '-m', 'playwright', 'install', 'chromium'], capture_output=True)
    subprocess.run([sys.executable, '-m', 'playwright', 'install-deps', 'chromium'], capture_output=True)
    print("✅ Playwright browsers installed!")
    
    
    import nest_asyncio
    # Allow nested event loops so asyncio.run() works inside notebook kernels
    # (Jupyter/Colab already run an event loop of their own).
    nest_asyncio.apply()


    import asyncio
    import json
    # NOTE: fixed from the garbled "Listing, Optionally available" / "Area",
    # which do not exist — the original names are List, Optional and Field.
    from typing import List, Optional
    from pydantic import BaseModel, Field


    print("\n" + "="*60)
    print("✅ INSTALLATION COMPLETE! Ready to crawl!")
    print("="*60)
    
    
    print("n" + "="*60)
    print("📖 PART 2: BASIC CRAWLING")
    print("="*60)
    
    
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
    
    
    async def basic_crawl():
        """Run the simplest possible crawl: fetch a page and print its markdown.

        Returns:
            The CrawlResult produced by crawler.arun().
        """
        # NOTE: URL restored to example.com — the translation artifact
        # "instance.com" points at an unrelated real domain.
        print("\n🔍 Running basic crawl on example.com...")

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")

            print(f"\n✅ Crawl successful: {result.success}")
            print(f"📄 Title: {result.metadata.get('title', 'N/A')}")
            print(f"📝 Markdown length: {len(result.markdown.raw_markdown)} characters")
            print(f"\n--- First 500 chars of markdown ---")
            print(result.markdown.raw_markdown[:500])

        return result


    result = asyncio.run(basic_crawl())
    
    
    print("n" + "="*60)
    print("⚙️ PART 3: CONFIGURED CRAWLING")
    print("="*60)
    
    
    async def configured_crawl():
        """Crawl with explicit browser- and run-level configuration.

        Returns:
            The CrawlResult produced by crawler.arun().
        """
        print("\n🔧 Running configured crawl with custom settings...")

        # Browser-level settings: headless Chromium, fixed viewport, desktop UA.
        # NOTE: user-agent restored to "Windows NT" — the garbled
        # "Home windows NT" would send a malformed UA string.
        browser_config = BrowserConfig(
            headless=True,
            verbose=True,
            viewport_width=1920,
            viewport_height=1080,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )

        # Per-run settings: bypass the cache and wait for network idle so
        # dynamically loaded content is present before extraction.
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            word_count_threshold=10,
            page_timeout=30000,
            wait_until="networkidle",
            verbose=True
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                url="https://httpbin.org/html",
                config=run_config
            )

            print(f"\n✅ Success: {result.success}")
            print(f"📊 Status code: {result.status_code}")
            print(f"\n--- Content Preview ---")
            print(result.markdown.raw_markdown[:400])

        return result


    result = asyncio.run(configured_crawl())
    
    
    print("n" + "="*60)
    print("📝 PART 4: MARKDOWN GENERATION")
    print("="*60)
    
    
    from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter
    from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
    
    
    async def markdown_generation_demo():
        """Demonstrate raw vs. fit markdown using a pruning content filter.

        PruningContentFilter drops low-value page regions; the filtered output
        is exposed as result.markdown.fit_markdown next to raw_markdown.

        Returns:
            The CrawlResult produced by crawler.arun().
        """
        print("\n🎯 Demonstrating markdown generation strategies...")

        browser_config = BrowserConfig(headless=True, verbose=False)

        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(
                    threshold=0.4,
                    # Restored from the garbled "fastened": crawl4ai accepts
                    # "fixed" or "dynamic" here.
                    threshold_type="fixed",
                    min_word_threshold=20
                )
            )
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                url="https://en.wikipedia.org/wiki/Web_scraping",
                config=run_config
            )

            raw_len = len(result.markdown.raw_markdown)
            # fit_markdown may be None/empty when the filter removes everything.
            fit_len = len(result.markdown.fit_markdown) if result.markdown.fit_markdown else 0

            print(f"\n📊 Markdown Comparison:")
            print(f"   Raw Markdown:  {raw_len:,} characters")
            print(f"   Fit Markdown:  {fit_len:,} characters")
            print(f"   Reduction:     {((raw_len - fit_len) / raw_len * 100):.1f}%")

            print(f"\n--- Fit Markdown Preview (first 600 chars) ---")
            print(result.markdown.fit_markdown[:600] if result.markdown.fit_markdown else "N/A")

        return result


    result = asyncio.run(markdown_generation_demo())



    Source link

    Naveed Ahmad

    Related Posts

    Google Launches ‘Skills’ in Chrome: Turning Reusable AI Prompts into One-Click Browser Workflows

    15/04/2026

    Anthropic’s rise is giving some OpenAI investors second thoughts

    15/04/2026

    Max Hodak’s Science Corp. is preparing to place its first sensor in a human brain

    15/04/2026
    Leave A Reply Cancel Reply

    Categories
    • AI
    Recent Comments
      Facebook X (Twitter) Instagram Pinterest
      © 2026 ThemeSphere. Designed by ThemeSphere.

      Type above and press Enter to search. Press Esc to cancel.