-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
120 lines (99 loc) · 4.17 KB
/
main.py
File metadata and controls
120 lines (99 loc) · 4.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
入口脚本:负责流程协调
- 弹出输入框获取 URL(和可选 selector)
- 调用 fetcher 抓取图片 URL 列表与标题
- 准备 canonical 列表与哈希
- 调用 recognizer 完成下载/匹配/归档并生成报告
"""
import os
import datetime as dt
from fetcher import get_url_and_selector_from_inputbox, fetch_image_urls_and_title
from recognizer import (
ensure_dirs,
compute_hashes_from_pil,
process_new_images,
)
# ========== Tunable global configuration ==========
CANON_DIR = "images" # directory of canonically named images (the ones the QQ bot serves)
ARCHIVE_DIR = "archive" # archive destination (archive/<title>/...)
LOG_DIR = "logs" # logs / report output directory
# Default CSS selector (can be edited or left blank in the input dialog)
IMG_SELECTOR_DEFAULT = "" # "" or "#js_content img"
# Attribute lookup priority for <img> tags (lazy-loading sites often use data-* attrs)
USE_ATTRS_IN_ORDER = ["data-src", "data-original-src", "src"]
# ORB similarity threshold (0.0 - 1.0; higher is stricter)
SIM_THRESHOLD = 0.8
# =======================================
def _safe_folder_name(page_title):
    """Build a filesystem-safe archive folder name from the page title.

    Strips characters that are illegal in Windows filenames; falls back to
    today's ISO date when the title is missing or empty after sanitizing.
    """
    name = page_title.strip() if page_title else ""
    name = "".join(ch for ch in name if ch not in "\\/:*?\"<>|").strip()
    return name or dt.date.today().isoformat()


def _list_canonical_images(canon_dir):
    """Return the sorted canonical image filenames found in *canon_dir*.

    Keeps only files with a known image extension that Pillow can actually
    open, so scripts and other non-image files in the directory are ignored.
    """
    # Explicit import: `import PIL` alone does not guarantee the Image
    # submodule is loaded, so the previous `__import__("PIL").Image` was fragile.
    from PIL import Image

    image_exts = (".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff")
    names = []
    for fname in sorted(os.listdir(canon_dir)):
        if not fname.lower().endswith(image_exts):
            continue
        path = os.path.join(canon_dir, fname)
        try:
            # Quick verify: confirm Pillow recognizes the file as an image.
            with Image.open(path) as im:
                im.verify()
            names.append(fname)
        except Exception:
            # Skip anything Pillow cannot identify.
            continue
    return names


def _hash_canonical_images(canon_dir, canon_files):
    """Precompute perceptual hashes for the canonical images (coarse filter).

    Returns a dict of filename -> hash value; unreadable files are skipped.
    """
    from PIL import Image

    hashes = {}
    for fname in canon_files:
        try:
            # Open inside `with` so the file handle is closed; the original
            # code wrapped only the `.convert()` copy and leaked the handle.
            with Image.open(os.path.join(canon_dir, fname)) as im:
                hashes[fname] = compute_hashes_from_pil(im.convert("RGB"))
        except Exception:
            # Skip canonical images that cannot be read.
            continue
    return hashes


def main():
    """Coordinate the pipeline: prompt for a URL, scrape image URLs, load the
    canonical set, then delegate download/match/archive/report to recognizer."""
    # Make sure the base directories exist.
    ensure_dirs([CANON_DIR, ARCHIVE_DIR, LOG_DIR])

    # Dialog box: page URL plus an optional CSS selector.
    page_url, selector = get_url_and_selector_from_inputbox(default_selector=IMG_SELECTOR_DEFAULT)
    if not page_url:
        print("未输入网址,脚本退出。")
        return

    # Empty selector means "grab every <img> on the page".
    img_selector = selector if (selector is not None and selector.strip() != "") else ""

    try:
        # Scrape image URLs and the article title (logic lives in fetcher.py).
        urls, page_title = fetch_image_urls_and_title(page_url, img_selector, USE_ATTRS_IN_ORDER)
    except Exception as e:
        print(f"[error] 抓取页面失败: {e}")
        return
    if not urls:
        print("[warn] 未抓到图片 URL,请检查选择器或页面结构。")
        return

    title_folder = _safe_folder_name(page_title)
    print(f"[info] page title -> archive folder: {title_folder}")
    print(f"[info] total images found: {len(urls)}")

    # ========== Load the canonical image list ==========
    canon_files = _list_canonical_images(CANON_DIR)
    if not canon_files:
        print(f"[warn] 未在 {CANON_DIR} 找到 canonical 图片,请先把基准图放好再运行。")
        return

    # Precompute canonical hashes for the fast coarse filter.
    canon_hashes = _hash_canonical_images(CANON_DIR, canon_files)

    # Hand off to the recognizer: download, match, archive, write the report.
    changed_items, unmatched_new_list, unmatched_canon, report_path = process_new_images(
        urls=urls,
        page_url=page_url,
        title_folder=title_folder,
        canon_files=canon_files,
        canon_hashes=canon_hashes,
        canon_dir=CANON_DIR,
        archive_dir=ARCHIVE_DIR,
        log_dir=LOG_DIR,
        use_attrs_in_order=USE_ATTRS_IN_ORDER,
        sim_threshold=SIM_THRESHOLD,
    )
    print(f"[report] {report_path}")
    print("[done]")
# Run the pipeline only when executed as a script (not on import).
if __name__ == "__main__":
    main()