-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
120 lines (99 loc) · 4.17 KB
/
main.py
File metadata and controls
120 lines (99 loc) · 4.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
入口脚本:负责流程协调
- 弹出输入框获取 URL(和可选 selector)
- 调用 fetcher 抓取图片 URL 列表与标题
- 准备 canonical 列表与哈希
- 调用 recognizer 完成下载/匹配/归档并生成报告
"""
import os
import datetime as dt
from fetcher import get_url_and_selector_from_inputbox, fetch_image_urls_and_title
from recognizer import (
ensure_dirs,
compute_hashes_from_pil,
process_new_images,
)
# ========== Tunable global configuration ==========
CANON_DIR = "images" # directory of canonically named images (the ones the QQ bot serves)
ARCHIVE_DIR = "archive" # archive destination (archive/<title>/...)
LOG_DIR = "logs" # logs / report output directory
# Default CSS selector (can be edited or left blank in the input dialog)
IMG_SELECTOR_DEFAULT = "" # "" or "#js_content img"
# Attribute lookup priority for <img> tags (lazy-loading sites often use data-* attrs)
USE_ATTRS_IN_ORDER = ["data-src", "data-original-src", "src"]
# ORB similarity threshold (0.0 - 1.0; higher is stricter)
SIM_THRESHOLD = 0.8
# =======================================
def _safe_folder_name(page_title):
    """Build a filesystem-safe archive folder name from the page title.

    Strips characters that are illegal in Windows filenames; falls back to
    today's ISO date when the title is missing or empty after sanitizing.
    """
    name = page_title.strip() if page_title else ""
    name = "".join(ch for ch in name if ch not in "\\/:*?\"<>|").strip()
    return name or dt.date.today().isoformat()


def _list_canonical_images(canon_dir):
    """Return the sorted canonical image filenames found in *canon_dir*.

    Keeps only files with a known image extension that Pillow can actually
    open, so scripts and other non-image files in the directory are ignored.
    """
    # Explicit import: `import PIL` alone does not guarantee the Image
    # submodule is loaded, so the previous `__import__("PIL").Image` was fragile.
    from PIL import Image

    image_exts = (".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff")
    names = []
    for fname in sorted(os.listdir(canon_dir)):
        if not fname.lower().endswith(image_exts):
            continue
        path = os.path.join(canon_dir, fname)
        try:
            # Quick verify: confirm Pillow recognizes the file as an image.
            with Image.open(path) as im:
                im.verify()
            names.append(fname)
        except Exception:
            # Skip anything Pillow cannot identify.
            continue
    return names


def _hash_canonical_images(canon_dir, canon_files):
    """Precompute perceptual hashes for the canonical images (coarse filter).

    Returns a dict of filename -> hash value; unreadable files are skipped.
    """
    from PIL import Image

    hashes = {}
    for fname in canon_files:
        try:
            # Open inside `with` so the file handle is closed; the original
            # code wrapped only the `.convert()` copy and leaked the handle.
            with Image.open(os.path.join(canon_dir, fname)) as im:
                hashes[fname] = compute_hashes_from_pil(im.convert("RGB"))
        except Exception:
            # Skip canonical images that cannot be read.
            continue
    return hashes


def main():
    """Coordinate the pipeline: prompt for a URL, scrape image URLs, load the
    canonical set, then delegate download/match/archive/report to recognizer."""
    # Make sure the base directories exist.
    ensure_dirs([CANON_DIR, ARCHIVE_DIR, LOG_DIR])

    # Dialog box: page URL plus an optional CSS selector.
    page_url, selector = get_url_and_selector_from_inputbox(default_selector=IMG_SELECTOR_DEFAULT)
    if not page_url:
        print("未输入网址,脚本退出。")
        return

    # Empty selector means "grab every <img> on the page".
    img_selector = selector if (selector is not None and selector.strip() != "") else ""

    try:
        # Scrape image URLs and the article title (logic lives in fetcher.py).
        urls, page_title = fetch_image_urls_and_title(page_url, img_selector, USE_ATTRS_IN_ORDER)
    except Exception as e:
        print(f"[error] 抓取页面失败: {e}")
        return
    if not urls:
        print("[warn] 未抓到图片 URL,请检查选择器或页面结构。")
        return

    title_folder = _safe_folder_name(page_title)
    print(f"[info] page title -> archive folder: {title_folder}")
    print(f"[info] total images found: {len(urls)}")

    # ========== Load the canonical image list ==========
    canon_files = _list_canonical_images(CANON_DIR)
    if not canon_files:
        print(f"[warn] 未在 {CANON_DIR} 找到 canonical 图片,请先把基准图放好再运行。")
        return

    # Precompute canonical hashes for the fast coarse filter.
    canon_hashes = _hash_canonical_images(CANON_DIR, canon_files)

    # Hand off to the recognizer: download, match, archive, write the report.
    changed_items, unmatched_new_list, unmatched_canon, report_path = process_new_images(
        urls=urls,
        page_url=page_url,
        title_folder=title_folder,
        canon_files=canon_files,
        canon_hashes=canon_hashes,
        canon_dir=CANON_DIR,
        archive_dir=ARCHIVE_DIR,
        log_dir=LOG_DIR,
        use_attrs_in_order=USE_ATTRS_IN_ORDER,
        sim_threshold=SIM_THRESHOLD,
    )
    print(f"[report] {report_path}")
    print("[done]")
# Run the pipeline only when executed as a script (not on import).
if __name__ == "__main__":
    main()