# Source: price_tracker.py from the bright-kr/ysl-price-tracker repository (main branch) on GitHub.
"""
Yves Saint Laurent Price Tracker - Bright Data Web Scraper API
----------------------------------------------------
Track prices, availability, and product data from Yves Saint Laurent.

Docs  : https://docs.brightdata.co.kr/scraping-automation/web-scraper-api/overview
Info  : https://brightdata.co.kr/products/insights/price-tracker/ysl
"""

import json
import os
import time

import requests
from dotenv import load_dotenv

load_dotenv()

# ── Configuration ──────────────────────────────────────────────────────────────
# Both settings below are mandatory and are read from the process environment;
# importing this module raises EnvironmentError when either one is missing or
# empty.

# API token, sent as a Bearer credential in the Authorization header.
API_TOKEN = os.environ.get("BRIGHTDATA_API_TOKEN", "")
if not API_TOKEN:
    raise EnvironmentError(
        "\n".join(
            (
                "BRIGHTDATA_API_TOKEN is not set.",
                "1. Get your token at: https://docs.brightdata.co.kr/general/account/account-settings#api-token",
                "2. Add it to your .env file: BRIGHTDATA_API_TOKEN=<your_token>",
            )
        )
    )

# Web Scraper ID (format: gd_xxxxxxxxxx), sent as the "dataset_id" query parameter.
# Look it up at https://brightdata.co.kr/cp/scrapers by searching for
# "Yves Saint Laurent".
DATASET_ID = os.environ.get("BRIGHTDATA_DATASET_ID", "")
if not DATASET_ID:
    raise EnvironmentError(
        "\n".join(
            (
                "BRIGHTDATA_DATASET_ID is not set.",
                "1. Navigate to: https://brightdata.co.kr/cp/scrapers",
                "2. Search for 'Yves Saint Laurent' and copy the Web Scraper ID.",
                "3. Add it to your .env file: BRIGHTDATA_DATASET_ID=<your_scraper_id>",
            )
        )
    )

# Root of the datasets API; endpoint paths are appended to it.
_BASE = "https://api.brightdata.co.kr/datasets/v3"
# Headers shared by every request issued from this module.
_HEADERS = {
    "Authorization": "Bearer " + API_TOKEN,
    "Content-Type": "application/json",
}


# ── Core API functions ─────────────────────────────────────────────────────────

def trigger_collection(
    inputs: list[dict],
    limit: int | None = None,
    notify: str | None = None,
    include_errors: bool = True,
) -> str:
    """
    Trigger a dataset collection job.

    Args:
        inputs: List of dicts - each with a "url" key (or keyword/category params).
            Sent as the JSON request body.
        limit: Max records to return (None = no limit). Sent as the
            "limit_multiple_results" query parameter.
        notify: Optional webhook URL called when the snapshot completes.
        include_errors: Include per-record error details in results.

    Returns:
        snapshot_id string, used to poll for results.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        KeyError: If the JSON response has no "snapshot_id" field.
    """
    params: dict = {
        "dataset_id": DATASET_ID,
        # requests would render a Python bool as "True"/"False" in the query
        # string; send the lowercase literal expected of a URL boolean instead.
        "include_errors": "true" if include_errors else "false",
    }
    if limit is not None:
        params["limit_multiple_results"] = limit
    if notify:
        params["notify"] = notify

    response = requests.post(
        f"{_BASE}/trigger",
        headers=_HEADERS,
        params=params,
        json=inputs,
        timeout=60,
    )
    response.raise_for_status()
    snapshot_id: str = response.json()["snapshot_id"]
    print(f"[✓] Collection triggered - snapshot_id: {snapshot_id}")
    return snapshot_id


def get_results(
    snapshot_id: str,
    poll_interval: int = 10,
    max_wait: float | None = None,
) -> list[dict]:
    """
    Poll until the snapshot is ready and return all collected records.

    Args:
        snapshot_id: ID returned by trigger_collection().
        poll_interval: Seconds between status checks (default 10).
        max_wait: Optional upper bound, in seconds, on the total time spent
            polling. None (the default) polls indefinitely.

    Returns:
        List of product record dicts. A JSON object without a "results" key
        is returned as a one-element list.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        TimeoutError: If max_wait elapses while the snapshot is still pending.
        RuntimeError: If the API answers with a status that is neither
            200, 202 nor an HTTP error.
        ValueError: If a 200 response carries a payload that is not a list
            (directly or under "results").
    """
    url = f"{_BASE}/snapshot/{snapshot_id}"
    params = {"format": "json"}
    # monotonic() is immune to wall-clock adjustments during a long poll.
    deadline = None if max_wait is None else time.monotonic() + max_wait

    while True:
        resp = requests.get(url, headers=_HEADERS, params=params, timeout=60)

        if resp.status_code == 200:
            data = resp.json()
            # API may return a list directly or {results: [...]}; a bare
            # object is wrapped so the return type is always a list.
            if isinstance(data, dict):
                data = data.get("results", [data])
            if not isinstance(data, list):
                raise ValueError(
                    f"Unexpected snapshot payload type: {type(data).__name__}"
                )
            print(f"[✓] Snapshot ready - {len(data)} record(s) returned.")
            return data

        if resp.status_code == 202:
            # The status text is informational only, so a missing or
            # non-JSON body must not abort the poll.
            try:
                body = resp.json()
            except ValueError:
                body = None
            progress = (
                body.get("status", "processing")
                if isinstance(body, dict)
                else "processing"
            )
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"Snapshot {snapshot_id} not ready after {max_wait}s "
                    f"(last status: {progress})"
                )
            print(f"[…] Status: {progress} - retrying in {poll_interval}s")
            time.sleep(poll_interval)
            continue

        # raise_for_status() only raises for 4xx/5xx. Any other status would
        # otherwise re-enter the loop immediately and hammer the API.
        resp.raise_for_status()
        raise RuntimeError(
            f"Unexpected HTTP status {resp.status_code} "
            f"while polling snapshot {snapshot_id}"
        )


# ── High-level helpers ─────────────────────────────────────────────────────────

def track_prices(urls: list[str], **kwargs) -> list[dict]:
    """
    Fetch product records for explicit Yves Saint Laurent product page URLs.

    Every URL is wrapped as a {"url": ...} input. Extra keyword arguments are
    forwarded unchanged to trigger_collection(); the resulting snapshot is
    then fetched with get_results().

    Example:
        results = track_prices([
            "https://www.ysl.com/en/products/sample-product-123456",
        ])
    """
    payload = []
    for product_url in urls:
        payload.append({"url": product_url})
    return get_results(trigger_collection(payload, **kwargs))


def discover_by_keyword(keyword: str, limit: int = 50, **kwargs) -> list[dict]:
    """
    Search Yves Saint Laurent products by keyword and return the records.

    Submits a single {"keyword": ...} input through trigger_collection(),
    passing limit and any extra keyword arguments along, then fetches the
    snapshot with get_results().

    NOTE(review): Bright Data discovery jobs are usually triggered with extra
    query parameters (e.g. type=discover_new, discover_by=keyword), and
    trigger_collection() sends none - confirm this scraper accepts a bare
    keyword input.

    Example:
        results = discover_by_keyword("leather tote", limit=100)
    """
    job_id = trigger_collection([{"keyword": keyword}], limit=limit, **kwargs)
    return get_results(job_id)


def discover_by_category(category_url: str, limit: int = 100, **kwargs) -> list[dict]:
    """
    Collect the products listed on a Yves Saint Laurent category page.

    Submits the category page as a single {"url": ...} input through
    trigger_collection(), passing limit and any extra keyword arguments
    along, then fetches the snapshot with get_results().

    Example:
        results = discover_by_category("https://ysl.com/category/sample", limit=200)
    """
    job_id = trigger_collection([{"url": category_url}], limit=limit, **kwargs)
    return get_results(job_id)


# ── Entry point ────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # ── Example 1: Track specific product URLs ─────────────────────────────────
    print("\n===== Example 1: Track by URL =====")
    results = track_prices(
        [
            "https://www.ysl.com/en/products/sample-product-123456",
            # Add more Yves Saint Laurent product URLs here
        ]
    )

    # Dump the raw records to disk; ensure_ascii=False keeps non-ASCII text
    # human-readable in the output file.
    output_file = "ysl_prices.json"
    with open(output_file, mode="w", encoding="utf-8") as out:
        json.dump(results, out, indent=2, ensure_ascii=False)
    print(f"Saved {len(results)} record(s) to {output_file}")

    # Print a one-line preview of the first record. Each field has a primary
    # key and a fallback key; the record schema is not defined in this file.
    if results:
        first = results[0]
        fallback_title = first.get("name", "N/A")
        fallback_price = first.get("price", "N/A")
        fallback_stock = first.get("availability", "N/A")
        title = first.get("title", fallback_title)
        price = first.get("final_price", fallback_price)
        currency = first.get("currency", "")
        available = first.get("in_stock", fallback_stock)
        print(f"  Sample → {title} | {price} {currency} | in_stock={available}")

    # ── Example 2: Keyword search ──────────────────────────────────────────────
    # print("\n===== Example 2: Keyword search =====")
    # kw_results = discover_by_keyword("example product", limit=20)
    # print(f"Found {len(kw_results)} products for keyword search")

    # ── Example 3: Category discovery ─────────────────────────────────────────
    # print("\n===== Example 3: Category browse =====")
    # cat_results = discover_by_category("https://ysl.com/category/sample", limit=50)
    # print(f"Found {len(cat_results)} products from category")