# Source: ImageHostUpdater/recognizer.py (MskTmi/ImageHostUpdater, branch main)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
识别模块：
- 提供哈希计算（aHash/pHash/dHash）
- 提供 ORB 特征匹配（更精确）
- 下载图片、按命名规则保存到 archive/<title>/（直接使用 canonical 文件名或带后缀 _1/_2）
- 生成报告 logs/report_*.md
"""

import os
import io
import hashlib
import datetime as dt
from urllib.parse import urlparse

import requests
from PIL import Image
import imagehash
import cv2
import numpy as np

# -------------------- 哈希与相似度函数 --------------------
def compute_hashes_from_pil(img: Image.Image):
    """
    Compute the three perceptual hashes used for the fast, coarse match.

    Returns a dict with the keys 'ahash', 'phash' and 'dhash' (in that
    order), each mapped to the value the corresponding `imagehash`
    function returns for `img`.
    """
    algorithms = (
        ("ahash", imagehash.average_hash),
        ("phash", imagehash.phash),
        ("dhash", imagehash.dhash),
    )
    return {label: hasher(img) for label, hasher in algorithms}


def hash_distance(h1, h2):
    """
    Return the combined "distance" between two hash collections.

    For every key of `h1`, the absolute difference between the entry in
    `h1` and the entry in `h2` is added up; a smaller total means the two
    images are more alike. Keys that exist only in `h2` are ignored.
    """
    total = 0
    for key in h1:
        total += abs(h1[key] - h2[key])
    return total


def orb_similarity_bytes(new_bytes, canon_path):
    """
    Fine-grained check of a new image against a canonical one using ORB.

    - new_bytes: encoded bytes of the new image
    - canon_path: on-disk path of the canonical image to compare against

    Returns good_matches / total_matches as a float in 0.0 - 1.0, where a
    "good" match is one whose descriptor distance is below 50. Any failure
    (undecodable image, no descriptors, no matches, or an exception raised
    along the way) yields 0.0 rather than propagating.
    """
    try:
        candidate = cv2.imdecode(np.frombuffer(new_bytes, np.uint8), cv2.IMREAD_GRAYSCALE)
        reference = cv2.imread(canon_path, cv2.IMREAD_GRAYSCALE)
        if candidate is None or reference is None:
            return 0.0

        detector = cv2.ORB_create(500)
        _, desc_new = detector.detectAndCompute(candidate, None)
        _, desc_ref = detector.detectAndCompute(reference, None)
        if desc_new is None or desc_ref is None:
            return 0.0

        matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
        pairs = matcher.match(desc_new, desc_ref)
        if not pairs:
            return 0.0

        # Matches with distance < 50 are counted as "reasonably good".
        good_count = sum(1 for pair in pairs if pair.distance < 50)
        return float(good_count) / float(len(pairs))
    except Exception:
        # Best-effort by design: a failed comparison simply scores zero.
        return 0.0


# -------------------- 工具函数 --------------------
def sha256_bytes(b: bytes):
    """
    Return the SHA-256 digest of `b` as a lowercase hex string.
    """
    digest = hashlib.sha256()
    digest.update(b)
    return digest.hexdigest()


def download_bytes(url: str, referer=None, headers=None) -> bytes:
    """
    Fetch `url` with a GET request and return the response body.

    - referer: value for the Referer header; falls back to `url` itself
    - headers: optional header dict. It is copied, never mutated; a
      truthy "Referer" already present in it is kept. When omitted, a
      default User-Agent plus the Referer are sent.

    The request uses a 60 second timeout. Timeouts, connection errors and
    non-success HTTP statuses are raised to the caller.
    """
    if headers is not None:
        request_headers = headers.copy()
        request_headers["Referer"] = request_headers.get("Referer") or referer or url
    else:
        request_headers = {"User-Agent": "Mozilla/5.0", "Referer": referer or url}

    response = requests.get(url, headers=request_headers, timeout=60)
    response.raise_for_status()
    return response.content


def ensure_dirs(paths):
    """
    Create every directory listed in `paths`, including missing parents.

    Directories that already exist are left untouched.
    """
    for directory in paths:
        os.makedirs(directory, exist_ok=True)


def _sniff_image_ext(data):
    """
    Return the extension implied by the leading magic bytes of `data`,
    or None when `data` is empty, not bytes-like, or not recognised.
    """
    if not data or not isinstance(data, (bytes, bytearray, memoryview)):
        return None
    head = bytes(data[:16])
    if head.startswith(b"\xff\xd8\xff"):
        return ".jpg"
    if head.startswith(b"\x89PNG\r\n\x1a\n"):
        return ".png"
    if head.startswith((b"GIF87a", b"GIF89a")):
        return ".gif"
    # WebP is a RIFF container: "RIFF" <4-byte size> "WEBP".
    if head[:4] == b"RIFF" and head[8:12] == b"WEBP":
        return ".webp"
    if head.startswith((b"II*\x00", b"MM\x00*")):
        return ".tiff"
    # Checked last because the two-byte BMP signature is the weakest one.
    if head.startswith(b"BM"):
        return ".bmp"
    return None


def ext_from_bytes_or_url(new_bytes: bytes, url: str, pil_img: "Image.Image | None" = None):
    """
    Infer the file extension for a downloaded image.

    Sources are tried in this order:
    1. `pil_img.format`, when an image object with a non-empty format is
       given (images produced by PIL operations such as `convert()` carry
       no format, so this step is skipped for them);
    2. the magic bytes at the start of `new_bytes`;
    3. the suffix of the URL path, if it is a known image extension;
    4. ".png" as the final default.

    Returns a lowercase, dot-prefixed extension such as ".jpg"; "jpeg" is
    always normalised to ".jpg".
    """
    fmt = getattr(pil_img, "format", None)
    if isinstance(fmt, str) and fmt:
        fmt = fmt.lower()
        return ".jpg" if fmt == "jpeg" else f".{fmt}"

    sniffed = _sniff_image_ext(new_bytes)
    if sniffed is not None:
        return sniffed

    url_ext = os.path.splitext(urlparse(url).path)[1].lower()
    if url_ext == ".jpeg":
        return ".jpg"
    if url_ext in {".jpg", ".png", ".gif", ".webp", ".bmp", ".tiff"}:
        return url_ext
    return ".png"


def normalize_ext(ext: str) -> str:
    """
    Lowercase an extension and fold ".jpeg" into ".jpg".

    A falsy input (empty string or None) yields an empty string.
    """
    lowered = ext.lower() if ext else ""
    return ".jpg" if lowered == ".jpeg" else lowered


# -------------------- 保存与报告 --------------------
def save_variant_archive(title_folder: str, canon_basename: str, new_bytes: bytes, ext: str, counter: int, archive_dir: str):
    """
    Store a matched variant under <archive_dir>/<title_folder>/.

    The file name is derived from the stem of `canon_basename`:
    - counter == 0  -> "<stem><ext>"            (e.g. Vicissitude_Branch.jpg)
    - counter >= 1  -> "<stem>_<counter><ext>"  (e.g. Vicissitude_Branch_1.jpg)
    If that name is already taken, "_1", "_2", ... is appended to it until
    an unused name is found, so existing files are not overwritten.

    Returns (path, filename) of the file that was written.
    """
    target_dir = os.path.join(archive_dir, title_folder)
    os.makedirs(target_dir, exist_ok=True)

    base = os.path.splitext(canon_basename)[0]
    prefix = base if counter == 0 else f"{base}_{counter}"

    filename = f"{prefix}{ext}"
    path = os.path.join(target_dir, filename)
    attempt = 0
    while os.path.exists(path):
        attempt += 1
        filename = f"{prefix}_{attempt}{ext}"
        path = os.path.join(target_dir, filename)

    with open(path, "wb") as out:
        out.write(new_bytes)
    return path, filename


def save_unmatched_new(title_folder: str, idx: int, new_bytes: bytes, ext: str, archive_dir: str):
    """
    Store an unmatched new image under <archive_dir>/<title_folder>/unmatched/.

    The file is named "unmatched_<idx><ext>"; if that name is taken,
    "_1", "_2", ... is appended until an unused name is found.

    Returns (path, filename) of the file that was written.
    """
    target_dir = os.path.join(archive_dir, title_folder, "unmatched")
    os.makedirs(target_dir, exist_ok=True)

    filename = f"unmatched_{idx}{ext}"
    path = os.path.join(target_dir, filename)
    attempt = 0
    while os.path.exists(path):
        attempt += 1
        filename = f"unmatched_{idx}_{attempt}{ext}"
        path = os.path.join(target_dir, filename)

    with open(path, "wb") as out:
        out.write(new_bytes)
    return path, filename


def write_report(changed_items, unmatched_new, unmatched_canon, url_list, title_folder, log_dir, page_url):
    """
    Write a Markdown report of one run to <log_dir>/report_<timestamp>.md.

    The report starts with summary counts and then lists, each in its own
    section and only when non-empty: the saved variants, the unmatched new
    images, and the canonical files that received no match. It is meant
    for manual review.

    Returns the path of the report file.
    """
    os.makedirs(log_dir, exist_ok=True)
    stamp = dt.datetime.now().strftime('%Y%m%d_%H%M%S')
    path = os.path.join(log_dir, f"report_{stamp}.md")

    out = [
        f"# Auto Update Report ({dt.datetime.now().isoformat(timespec='seconds')})",
        "",
        f"- Source page: {page_url}",
        f"- Archive folder: {os.path.join('archive', title_folder)}",
        f"- Total new images: {len(url_list)}",
        f"- Saved variants (matched to canonical): {len(changed_items)}",
        f"- Saved unmatched new: {len(unmatched_new)}",
        f"- Canonicals with no match this run: {len(unmatched_canon)}",
        "",
    ]

    if changed_items:
        out.append("## Saved variants")
        out.extend(
            f"- `{item['canon']}` <= new[{item['new_idx']}] sim={item['sim']:.2f} url={item['url']} saved={item['archive_name']}"
            for item in changed_items
        )
        out.append("")

    if unmatched_new:
        out.append("## Unmatched new images")
        out.extend(
            f"- new[{ni['idx']}] url={ni['url']} best_sim={ni['best_sim']:.2f} saved={ni['archive_name']}"
            for ni in unmatched_new
        )
        out.append("")

    if unmatched_canon:
        out.append("## Unmatched canon (no new similar image this run)")
        out.extend(f"- `{ci}`" for ci in unmatched_canon)
        out.append("")

    with open(path, "w", encoding="utf-8") as report:
        report.write("\n".join(out))
    return path


# -------------------- 核心处理函数 --------------------
def process_new_images(
        urls,
        page_url,
        title_folder,
        canon_files,
        canon_hashes,
        canon_dir,
        archive_dir,
        log_dir,
        use_attrs_in_order=None,
        sim_threshold=0.8,
):
    """
    Download each new image, match it against the canonical set, archive
    it, and write a report.

    - urls: URLs of the new images to download
    - page_url: source page; sent as Referer and recorded in the report
    - title_folder: sub-folder of `archive_dir` the images are saved into
    - canon_files: canonical file names (relative to `canon_dir`)
    - canon_hashes: precomputed hashes, mapping canonical file name to the
      hash dict compared via `hash_distance`
    - canon_dir / archive_dir / log_dir: directories for canonical images,
      archived downloads and reports
    - use_attrs_in_order: accepted for interface compatibility; not used
      by this function
    - sim_threshold: minimum ORB similarity for a new image to count as a
      variant of its closest canonical image

    Images that fail to download or cannot be parsed are skipped with a
    warning and do not appear in either result list.

    Returns (changed_items, unmatched_new_list, unmatched_canon, report_path).
    """
    changed_items = []
    unmatched_new_list = []

    # How many variants were saved per canonical so far (drives _1/_2 naming).
    counters = {f: 0 for f in canon_files}
    # Extensions accepted when reusing the canonical file's own suffix.
    allowed_exts = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"}

    for idx, url in enumerate(urls):
        try:
            b = download_bytes(url, referer=page_url)
        except Exception as e:
            print(f"[warn] 下载失败 {url}: {e}")
            continue

        # Keep the freshly opened image: it still knows its source format,
        # whereas the RGB copy returned by convert() has format None.
        try:
            src_img = Image.open(io.BytesIO(b))
            n_img = src_img.convert("RGB")
        except Exception:
            print(f"[warn] 无法解析图片: {url}")
            continue

        # Coarse pass: pick the canonical whose hashes are closest.
        n_hash = compute_hashes_from_pil(n_img)
        best = None
        best_dist = float("inf")
        for cf, c_hash in canon_hashes.items():
            d = hash_distance(n_hash, c_hash)
            if d < best_dist:
                best = cf
                best_dist = d

        if best is None:
            # Only reachable when canon_hashes is empty: archive as unmatched.
            ext = ext_from_bytes_or_url(b, url, src_img)
            path, name = save_unmatched_new(title_folder, idx, b, ext, archive_dir)
            unmatched_new_list.append({"idx": idx, "url": url, "best_sim": 0.0, "archive_name": name})
            print(f"[archive] unmatched new[{idx}] -> {name}")
            continue

        # Fine pass: ORB feature match against the best candidate only.
        sim = orb_similarity_bytes(b, os.path.join(canon_dir, best))

        # Prefer the canonical file's extension so archived names line up
        # with it; fall back to detection when it has no usable extension.
        canon_ext = normalize_ext(os.path.splitext(best)[1].lower())
        if canon_ext and canon_ext in allowed_exts:
            ext = canon_ext
        else:
            ext = ext_from_bytes_or_url(b, url, src_img)

        if sim >= sim_threshold:
            # .get() tolerates a canon_hashes key that is missing from
            # canon_files instead of raising KeyError.
            counters[best] = counters.get(best, 0) + 1
            archive_path, archive_name = save_variant_archive(title_folder, best, b, ext, counters[best] - 1, archive_dir)
            changed_items.append({
                "canon": best, "new_idx": idx, "sim": sim, "url": url, "archive_name": archive_name
            })
            print(f"[archive] matched new[{idx}] -> {archive_name} (canon={best} sim={sim:.2f})")
        else:
            archive_path, archive_name = save_unmatched_new(title_folder, idx, b, ext, archive_dir)
            unmatched_new_list.append({
                "idx": idx, "url": url, "best_sim": sim, "archive_name": archive_name
            })
            print(f"[archive] unmatched new[{idx}] -> {archive_name}")

    # Canonical files that received no matching new image in this run.
    matched_canon_set = {item["canon"] for item in changed_items}
    unmatched_canon = [cf for cf in canon_files if cf not in matched_canon_set]

    report_path = write_report(changed_items, unmatched_new_list, unmatched_canon, urls, title_folder, log_dir, page_url)
    return changed_items, unmatched_new_list, unmatched_canon, report_path