-
Notifications
You must be signed in to change notification settings - Fork 198
Expand file tree
/
Copy pathremote_multimodal_dual_automation.rs
More file actions
122 lines (108 loc) · 4.7 KB
/
remote_multimodal_dual_automation.rs
File metadata and controls
122 lines (108 loc) · 4.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
//! Dual-model multi-round automation example.
//!
//! Shows dual-model routing for multi-round browser automation (not just
//! extraction). A vision model handles the first round and stagnation,
//! while a cheaper text model drives mid-round actions via HTML context.
//!
//! Run with:
//! ```bash
//! OPEN_ROUTER=your-api-key cargo run --example remote_multimodal_dual_automation --features "spider/sync spider/chrome spider/agent_chrome"
//! ```
//!
//! Example output:
//! === Page Received ===
//! URL: https://books.toscrape.com/
//!
//! === AI Results ===
//! Result 1:
//! Content: {"extracted":{"title":"A Light in the Attic","price":"£51.77","availability":"In stock (22 available)"},"steps":[{"Click":{"selector":"article.product_pod h3 a"}}]}
//! Tokens: 3541 prompt + 296 completion = 3837 total (2 LLM calls)
//!
//! === Completed in 5.31s ===
extern crate spider;
use spider::features::automation::{ModelEndpoint, RemoteMultimodalConfigs, VisionRouteMode};
use spider::tokio;
use spider::website::Website;
#[tokio::main]
async fn main() {
    env_logger::init();

    // Read the OpenRouter credential up front; the example cannot run without it.
    let api_key =
        std::env::var("OPEN_ROUTER").expect("OPEN_ROUTER environment variable must be set");

    // Target page whose data requires multi-round interaction to reach.
    let url = "https://books.toscrape.com/";

    // Build the entire dual-model automation config in one fluent chain.
    //
    // VisionFirst routes the opening rounds (screenshot-based) and any
    // stagnation/stuck recoveries to the vision model, while stable
    // mid-rounds are driven by the cheaper text model via HTML context.
    //
    // Each ModelEndpoint can point at a different provider by setting
    // `api_url`/`api_key`, e.g.:
    //
    //   ModelEndpoint::new("gpt-4o")
    //       .with_api_url("https://api.openai.com/v1/chat/completions")
    //       .with_api_key("sk-openai-...")
    //
    // Endpoint fields left as None inherit from this parent config.
    let config = RemoteMultimodalConfigs::new(
        "https://openrouter.ai/api/v1/chat/completions",
        "qwen/qwen-2.5-vl-72b-instruct",
    )
    .with_api_key(&api_key)
    .with_dual_models(
        ModelEndpoint::new("qwen/qwen-2.5-vl-72b-instruct"), // screenshot-driven rounds
        ModelEndpoint::new("qwen/qwen-2.5-72b-instruct"),    // HTML-driven rounds
    )
    .with_vision_route_mode(VisionRouteMode::VisionFirst)
    .with_extra_ai_data(true)
    .with_include_html(true)
    .with_include_title(true)
    .with_include_url(true)
    .with_max_rounds(4)
    .with_request_json_object(true)
    .with_extraction_prompt(
        "Navigate to the first book in the catalog. Extract its title, price, and availability.",
    )
    .with_user_message_extra(
        "Click on the first book link to navigate to its detail page, then extract the book data.",
    );

    // Crawl a single page with the multimodal automation attached.
    let mut site: Website = Website::new(url)
        .with_limit(1)
        .with_remote_multimodal(Some(config))
        .build()
        .unwrap();

    // Subscribe before crawling so no page events are missed, and print
    // any AI extraction results (with token usage) as pages arrive.
    let mut events = site.subscribe(16);
    let printer = tokio::spawn(async move {
        while let Ok(page) = events.recv().await {
            println!("=== Page Received ===");
            println!("URL: {}", page.get_url());
            let Some(ref results) = page.extra_remote_multimodal_data else {
                continue;
            };
            println!("\n=== AI Results ===");
            for (idx, res) in results.iter().enumerate() {
                println!("Result {}:", idx + 1);
                println!(" Content: {}", res.content_output);
                if let Some(ref tok) = res.usage {
                    println!(
                        " Tokens: {} prompt + {} completion = {} total ({} LLM calls)",
                        tok.prompt_tokens,
                        tok.completion_tokens,
                        tok.total_tokens,
                        tok.llm_calls,
                    );
                }
            }
        }
    });

    // Run the crawl, then close the channel so the printer task drains and exits.
    let started = tokio::time::Instant::now();
    site.crawl().await;
    site.unsubscribe();
    let _ = printer.await;
    println!("\n=== Completed in {:?} ===", started.elapsed());
}