import asyncio
import logging
from typing import Any

from crawlee.browsers import BrowserPool, CrawleePage, PlaywrightBrowserPlugin
from crawlee.crawlers import (
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
    PlaywrightPostNavCrawlingContext,
    PlaywrightPreNavCrawlingContext,
)
from crawlee.errors import SessionError
from crawlee.storages import KeyValueStore

# Set up basic logging to see the hook and handler output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


async def main() -> None:
    # 1. Define the browser plugins, one per browser type.
    plugin_chromium = PlaywrightBrowserPlugin(
        browser_type='chromium',
        max_open_pages_per_browser=1,
        # Launch options are configured on the plugin, not on the crawler.
        browser_launch_options={'slow_mo': 100},
        # Context options likewise belong to the plugin.
        browser_new_context_options={
            'color_scheme': 'dark',
            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Crawlee/1.0',
        },
    )
    plugin_firefox = PlaywrightBrowserPlugin(
        browser_type='firefox',
        max_open_pages_per_browser=1,
        browser_launch_options={'slow_mo': 100},
        browser_new_context_options={
            'color_scheme': 'dark',
            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Crawlee/1.0',
        },
    )

    # 2. Configure the browser pool and attach page lifecycle hooks.
    browser_pool = BrowserPool(plugins=[plugin_chromium, plugin_firefox])
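    # With both plugins registered, the pool rotates between Chromium and
    # Firefox when it launches new browsers (round-robin over the plugins).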

    @browser_pool.pre_page_create_hook
    async def log_page_init(page_id: str, *_args: Any) -> None:
        logger.info(f'--- Initializing page {page_id} ---')

    @browser_pool.post_page_create_hook
    async def set_viewport(crawlee_page: CrawleePage, *_args: Any) -> None:
        await crawlee_page.page.set_viewport_size({'width': 1280, 'height': 1024})
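    # Every page the pool hands out now starts with a 1280x1024 viewport.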

    # 3. Initialize the crawler once, with all settings in place.
    crawler = PlaywrightCrawler(
        browser_pool=browser_pool,
        max_requests_per_crawl=20,
        # No browser arguments here - they live on the plugins above.
    )

    # 4. Define the request handler (the "brain" of the crawler).
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing: {context.request.url}')
        page_text = await context.page.locator('body').inner_text()
        # Clean up the text (collapse whitespace and excessive newlines).
        clean_text = ' '.join(page_text.split())
        context.log.info(f'Extracted {len(clean_text)} characters from {context.request.url}')
        # Derive a storage key from the URL, sanitized so it is filename-safe.
        filename = context.request.url.replace('https://', '').replace('/', '_')
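        # e.g. 'https://pypi.org/project/crawlee/' becomes 'pypi.org_project_crawlee_'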
        # Store a snippet in the default dataset; the full text goes to the KVS below.
        await context.push_data({
            'url': context.request.url,
            'text_content': clean_text[:500] + '...',
        })
        # Also save the full text to the Key-Value Store for easy access.
        kvs = await KeyValueStore.open()
        await kvs.set_value(f'text-{filename}', clean_text)

    # 5. Hooks for navigation and security.
    @crawler.pre_navigation_hook
    async def configure_page(context: PlaywrightPreNavCrawlingContext) -> None:
        # Example: block images/CSS to speed up text extraction.
        # await context.block_requests(url_patterns=['.jpg', '.png', '.css'])
        context.log.info(f'Starting navigation to {context.request.url}')

    @crawler.post_navigation_hook
    async def custom_captcha_check(context: PlaywrightPostNavCrawlingContext) -> None:
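        # Raising SessionError marks the current session as bad, so Crawlee
        # retires it and retries the request with a fresh session.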
        if await context.page.locator('input[name="captcha"]').first.is_visible():
            raise SessionError(f'Captcha detected on {context.request.url}')

    # 6. Run the crawler with your 4 target websites.
    target_urls = [
        'https://crawlee.dev',
        'https://pypi.org/project/crawlee/',
        'https://www.wikipedia.org',
        'https://www.python.org',
    ]
    await crawler.run(target_urls)


if __name__ == '__main__':
    asyncio.run(main())
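
# A minimal sketch (assuming the crawl above has populated the default
# Key-Value Store) of how a saved page could be read back out afterwards:
#
#     async def read_saved_text(url: str) -> None:
#         kvs = await KeyValueStore.open()
#         key = 'text-' + url.replace('https://', '').replace('/', '_')
#         text = await kvs.get_value(key)
#         print(text[:200] if text else f'No entry for {key}')
#
#     asyncio.run(read_saved_text('https://crawlee.dev'))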