import asyncio
import logging
from typing import Any

from crawlee.browsers import BrowserPool, CrawleePage, PlaywrightBrowserPlugin
from crawlee.crawlers import (
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
    PlaywrightPostNavCrawlingContext,
    PlaywrightPreNavCrawlingContext,
)
from crawlee.errors import SessionError
from crawlee.storages import KeyValueStore

# Set up basic logging to see the hook and handler output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


async def main() -> None:
    # 1. Define the browser plugins, one per browser type.
    plugin_chromium = PlaywrightBrowserPlugin(
        browser_type='chromium',
        max_open_pages_per_browser=1,
        # Launch options are configured on the plugin, not on the crawler.
        browser_launch_options={'slow_mo': 100},
        # Context options likewise belong to the plugin.
        browser_new_context_options={
            'color_scheme': 'dark',
            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Crawlee/1.0',
        },
    )
    plugin_firefox = PlaywrightBrowserPlugin(
        browser_type='firefox',
        max_open_pages_per_browser=1,
        browser_launch_options={'slow_mo': 100},
        browser_new_context_options={
            'color_scheme': 'dark',
            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Crawlee/1.0',
        },
    )

    # 2. Configure the browser pool and attach page lifecycle hooks.
    browser_pool = BrowserPool(plugins=[plugin_chromium, plugin_firefox])
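    # With both plugins registered, the pool rotates between Chromium and
    # Firefox when it launches new browsers (round-robin over the plugins).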

    @browser_pool.pre_page_create_hook
    async def log_page_init(page_id: str, *_args: Any) -> None:
        logger.info(f'--- Initializing page {page_id} ---')

    @browser_pool.post_page_create_hook
    async def set_viewport(crawlee_page: CrawleePage, *_args: Any) -> None:
        await crawlee_page.page.set_viewport_size({'width': 1280, 'height': 1024})
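    # Every page the pool hands out now starts with a 1280x1024 viewport.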

    # 3. Initialize the crawler once, with all settings in place.
    crawler = PlaywrightCrawler(
        browser_pool=browser_pool,
        max_requests_per_crawl=20,
        # No browser arguments here - they live on the plugins above.
    )

    # 4. Define the request handler (the "brain" of the crawler).
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing: {context.request.url}')
        page_text = await context.page.locator('body').inner_text()
        # Clean up the text (collapse whitespace and excessive newlines).
        clean_text = ' '.join(page_text.split())
        context.log.info(f'Extracted {len(clean_text)} characters from {context.request.url}')
        # Derive a storage key from the URL, sanitized so it is filename-safe.
        filename = context.request.url.replace('https://', '').replace('/', '_')
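        # e.g. 'https://pypi.org/project/crawlee/' becomes 'pypi.org_project_crawlee_'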
        # Store a snippet in the default dataset; the full text goes to the KVS below.
        await context.push_data({
            'url': context.request.url,
            'text_content': clean_text[:500] + '...',
        })
        # Also save the full text to the Key-Value Store for easy access.
        kvs = await KeyValueStore.open()
        await kvs.set_value(f'text-{filename}', clean_text)

    # 5. Hooks for navigation and security.
    @crawler.pre_navigation_hook
    async def configure_page(context: PlaywrightPreNavCrawlingContext) -> None:
        # Example: block images/CSS to speed up text extraction.
        # await context.block_requests(url_patterns=['.jpg', '.png', '.css'])
        context.log.info(f'Starting navigation to {context.request.url}')

    @crawler.post_navigation_hook
    async def custom_captcha_check(context: PlaywrightPostNavCrawlingContext) -> None:
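        # Raising SessionError marks the current session as bad, so Crawlee
        # retires it and retries the request with a fresh session.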
        if await context.page.locator('input[name="captcha"]').first.is_visible():
            raise SessionError(f'Captcha detected on {context.request.url}')

    # 6. Run the crawler with your 4 target websites.
    target_urls = [
        'https://crawlee.dev',
        'https://pypi.org/project/crawlee/',
        'https://www.wikipedia.org',
        'https://www.python.org',
    ]
    await crawler.run(target_urls)


if __name__ == '__main__':
    asyncio.run(main())
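
# A minimal sketch (assuming the crawl above has populated the default
# Key-Value Store) of how a saved page could be read back out afterwards:
#
#     async def read_saved_text(url: str) -> None:
#         kvs = await KeyValueStore.open()
#         key = 'text-' + url.replace('https://', '').replace('/', '_')
#         text = await kvs.get_value(key)
#         print(text[:200] if text else f'No entry for {key}')
#
#     asyncio.run(read_saved_text('https://crawlee.dev'))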