Skip to content

Commit f16d359

Browse files
authored
Merge-build: v2.10.0-beta.3
2 parents ab7ea93 + 78cd969 commit f16d359

128 files changed

Lines changed: 7811 additions & 7159 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -176,6 +176,7 @@ custom/start/*
176176
GUI/src/*
177177
!GUI/src/preview_format/
178178
!GUI/src/material_ct.py
179+
!docs/.vitepress/
179180
*.db*
180181
# cgs/build
181182
*.ico
@@ -206,3 +207,4 @@ deploy/launcher/mac/*.html
206207
openspec/
207208
nul
208209
.omc
210+
*.*ai

CGS.py

Lines changed: 1 addition & 9 deletions
Original file line number · Diff line number · Diff line change
@@ -59,15 +59,7 @@ def handle_current_exception(self, phase):
5959

6060
def handle_exception(self, exc_type, exc_value, exc_traceback, phase):
6161
if self.ui is not None:
62-
try:
63-
self.ui.hook_exception(exc_type, exc_value, exc_traceback)
64-
return
65-
except Exception:
66-
trace_text = "".join(traceback.format_exception(*sys.exc_info()))
67-
log_path = self._append_fatal_log("hook_exception failed", trace_text)
68-
self._write_stderr(f"\n[CGS hook_exception failed] log: {log_path}\n{trace_text}\n")
69-
return sys.__excepthook__(*sys.exc_info())
70-
62+
return self.ui.hook_exception(exc_type, exc_value, exc_traceback)
7163
trace_text = "".join(traceback.format_exception(exc_type, exc_value, exc_traceback))
7264
log_path = self._append_fatal_log(phase, trace_text)
7365
self._write_stderr(f"\n[CGS uncaught] log: {log_path}\n{trace_text}\n")

ComicSpider/pipelines.py

Lines changed: 13 additions & 32 deletions
Original file line number · Diff line number · Diff line change
@@ -108,32 +108,23 @@ def file_folder(self, basepath, section, spider, title, item):
108108
spider.tasks_path[uuid_md5] = path
109109
return path
110110

111-
def image_downloaded(self, response, request, info, *, item=None):
112-
spider = info.spider
113-
try:
114-
super(ComicPipeline, self).image_downloaded(response, request, info, item=item)
115-
stats = spider.crawler.stats
116-
self._sync_item_progress(spider, stats, item, count_download_stat=True)
117-
except Exception as e:
118-
spider.logger.error(f'traceback: {str(type(e))}:: {str(e)}')
119-
120111
@staticmethod
121112
def _processed_file_count(stats):
122113
return (
123114
stats.get_value('file_status_count/downloaded', default=0) +
124115
stats.get_value('file_status_count/uptodate', default=0)
125116
)
126117

127-
def _sync_item_progress(self, spider, stats, item, *, count_download_stat):
118+
def _sync_item_progress(self, spider, stats, item):
128119
total = getattr(spider, 'total', 0) or 0
129120
processed = self._processed_file_count(stats)
130121
percent = int((processed / total) * 100) if total else 0
131122
spider.emit(BarProgressEvent(job_id=getattr(spider, '_job_id', None), percent=percent))
132123
task_obj = TaskObj(item.get('uuid_md5'), item.get('page'), item['image_urls'][0])
133-
self._record_task_progress(spider, stats, task_obj, count_download_stat=count_download_stat)
124+
self._record_task_progress(spider, task_obj)
134125

135126
@staticmethod
136-
def _record_task_progress(spider, stats, task_obj, *, count_download_stat=True):
127+
def _record_task_progress(spider, task_obj):
137128
_tasks = spider.tasks[task_obj.taskid]
138129
_tasks.downloaded.append(task_obj)
139130
curr_progress = int(len(_tasks.downloaded) / _tasks.tasks_count * 100)
@@ -150,27 +141,16 @@ def _record_task_progress(spider, stats, task_obj, *, count_download_stat=True):
150141
task_obj=task_obj,
151142
is_new=False,
152143
))
153-
if count_download_stat:
154-
stats.inc_value('image/downloaded')
155-
156-
def media_to_download(self, request: Request, info, *, item=None):
157-
dfd = maybeDeferred(super().media_to_download, request, info, item=item)
158-
159-
def _track_uptodate(file_info):
160-
if (
161-
item is not None and
162-
isinstance(file_info, dict) and
163-
file_info.get('status') == 'uptodate'
164-
):
165-
self._sync_item_progress(info.spider, info.spider.crawler.stats, item, count_download_stat=False)
166-
return file_info
167-
168-
dfd.addCallback(_track_uptodate)
169-
return dfd
170144

171145
def item_completed(self, results, item, info):
172-
_item = super(ComicPipeline, self).item_completed(results, item, info)
173-
return _item
146+
completed_item = super(ComicPipeline, self).item_completed(results, item, info)
147+
if not any(
148+
ok and isinstance(file_info, dict) and file_info.get('status') in {'downloaded', 'uptodate'}
149+
for ok, file_info in results
150+
):
151+
return completed_item
152+
self._sync_item_progress(info.spider, info.spider.crawler.stats, item)
153+
return completed_item
174154

175155

176156
class WnacgComicPipeline(ComicPipeline):
@@ -255,7 +235,8 @@ def _download_via_curl():
255235

256236
def _handle_curl_result(result):
257237
status_code, content = result
258-
return self.media_downloaded(
238+
return maybeDeferred(
239+
self.media_downloaded,
259240
Response(url=request.url,status=status_code,body=content,request=request),
260241
request,info,item=item)
261242

ComicSpider/runtime/thread_runner.py

Lines changed: 4 additions & 7 deletions
Original file line number · Diff line number · Diff line change
@@ -69,6 +69,7 @@ def run(self):
6969
installed_reactor = f"{reactor.__class__.__module__}.{reactor.__class__.__name__}"
7070
s.set("TWISTED_REACTOR", installed_reactor, priority="cmdline")
7171
configure_logging(s)
72+
logging.getLogger("PIL.Image").setLevel(logging.WARNING)
7273
self._runner = CrawlerRunner(s)
7374
self._settings = s
7475
self._ready.set()
@@ -113,11 +114,7 @@ def _start_crawl(self, job: SpiderDownloadJob):
113114
self.state.update(stage="crawling", active_job_id=job.job_id, error=None)
114115
self.event_q.put(JobAcceptedEvent(job_id=job.job_id))
115116

116-
d = self._runner.crawl(
117-
spider_cls_name,
118-
runtime_thread=self,
119-
job=job,
120-
)
117+
d = self._runner.crawl(spider_cls_name, runtime_thread=self, job=job)
121118
d.addCallback(lambda _: self._on_crawl_finished(job))
122119
d.addErrback(lambda f: self._on_crawl_error(job, f))
123120

@@ -126,7 +123,7 @@ def _on_crawl_finished(self, job):
126123
error = getattr(job, "runtime_error", None)
127124
stage = "idle" if success else "error"
128125
self.state.update(stage=stage, active_job_id=None, progress=0.0, error=error)
129-
self.event_q.put(JobFinishedEvent(job_id=job.job_id, success=success))
126+
self.event_q.put(JobFinishedEvent(job_id=job.job_id, success=success, error=error))
130127
if success:
131128
logger.info(f"Job {job.job_id} finished")
132129
else:
@@ -136,7 +133,7 @@ def _on_crawl_error(self, job, failure):
136133
error_msg = str(failure.value) if hasattr(failure, 'value') else str(failure)
137134
self.state.update(stage="error", active_job_id=None, progress=0.0, error=error_msg)
138135
self.event_q.put(ErrorEvent(job_id=job.job_id, error=error_msg))
139-
self.event_q.put(JobFinishedEvent(job_id=job.job_id, success=False))
136+
self.event_q.put(JobFinishedEvent(job_id=job.job_id, success=False, error=error_msg))
140137
logger.error(f"Job {job.job_id} failed: {error_msg}")
141138

142139
def submit_job(self, job: SpiderDownloadJob):

ComicSpider/spiders/basecomicspider.py

Lines changed: 21 additions & 23 deletions
Original file line number · Diff line number · Diff line change
@@ -8,20 +8,22 @@
88

99
import scrapy
1010

11-
from variables import *
11+
from variables import * # noqa: F403
1212
from assets import res as ori_res
1313
from ComicSpider.items import ComicspiderItem
1414
from ComicSpider.runtime.job_models import create_job_context, iter_download_items
1515
from GUI.core.font import font_color
1616
from utils import PresetHtmlEl, temp_p, conf
17-
from utils.processed_class import TextBrowserState, ProcessState, Url
17+
from utils.processed_class import TextBrowserState, ProcessState
1818

19-
from utils.protocol import SpiderDownloadJob, JobContext, LogEvent, ProcessStateEvent, TasksObjEvent
19+
from utils.protocol import SpiderDownloadJob, JobContext, LogEvent, ProcessStateEvent
2020
from utils.website import (
21-
correct_domain,
22-
InfoMinix, BookInfo, Episode
21+
correct_domain, BookInfo, Episode
22+
)
23+
from utils.website.registry import (
24+
resolve_provider_descriptor_by_spider,
25+
create_spider_site_runtime,
2326
)
24-
from utils.website.registry import resolve_spider_adapter
2527
from utils.website.schema import BodyFormat
2628
from utils.sql import SqlRecorder, SqlrV
2729
from utils.meta import MetaRecorder
@@ -71,8 +73,8 @@ class BaseComicSpider(scrapy.Spider):
7173
text_browser_state = TextBrowserState(text='')
7274
process_state = ProcessState(process='init')
7375
say: SayToGui = None
74-
adapter = None
75-
site = None
76+
provider_descriptor = None
77+
spider_site_runtime = None
7678
record_sql: SqlRecorder = None
7779
rv_sql: SqlrV = None
7880
ua = {}
@@ -96,6 +98,7 @@ class BaseComicSpider(scrapy.Spider):
9698
turn_page_search: str = None
9799
turn_page_info: tuple = None
98100
_enable_episode_dispatch = False
101+
remove_domain_cache_on_finished_miss = True
99102

100103
def preready(self):
101104
...
@@ -152,10 +155,7 @@ def _bind_runtime_context(self, job: SpiderDownloadJob):
152155
getattr(getattr(item, "from_book", None), "preview_url", None),
153156
])
154157
elif isinstance(item, BookInfo):
155-
candidates.extend([
156-
getattr(item, "url", None),
157-
getattr(item, "preview_url", None),
158-
])
158+
candidates.extend([getattr(item, "url", None), getattr(item, "preview_url", None)])
159159
for candidate in candidates:
160160
if origin := self._url_origin(candidate):
161161
self._runtime_origin = origin
@@ -275,12 +275,7 @@ def parse_section(self, response):
275275
if isinstance(url_or_ep, Episode):
276276
yield from self._process_episode(url_or_ep)
277277
elif isinstance(url_or_ep, str):
278-
yield scrapy.Request(
279-
url=url_or_ep,
280-
callback=self.parse_fin_page,
281-
meta={'book': book, 'page': page},
282-
dont_filter=True,
283-
)
278+
yield scrapy.Request(url=url_or_ep, callback=self.parse_fin_page, meta={'book': book, 'page': page}, dont_filter=True)
284279

285280
def need_sec_next_page(self, resp):
286281
pass
@@ -353,8 +348,8 @@ def from_crawler(cls, crawler, *args, **kwargs):
353348

354349
spider.record_sql = SqlRecorder()
355350
spider.rv_sql = SqlrV(1 if spider.name in spider.settings.get('SPECIAL') else 0).connect()
356-
spider.adapter = resolve_spider_adapter(spider.name)
357-
spider.site = spider.adapter.create_session(conf)
351+
spider.provider_descriptor = resolve_provider_descriptor_by_spider(spider.name)
352+
spider.spider_site_runtime = create_spider_site_runtime(spider.name, conf_state=conf)
358353
spider.mr = MetaRecorder(conf)
359354

360355
if job:
@@ -373,7 +368,7 @@ def _remove_cache(self):
373368
os.remove(domain_cache)
374369

375370
def _finish_counters(self, stats):
376-
downloaded_count = stats.get_value('image/downloaded', 0)
371+
downloaded_count = stats.get_value('file_status_count/downloaded', 0)
377372
uptodate_count = stats.get_value('file_status_count/uptodate', 0)
378373
total = self.job_context.total if self.job_context else self.total
379374
return downloaded_count, uptodate_count, downloaded_count + uptodate_count, total
@@ -412,11 +407,13 @@ def _handle_finished_status(self, stats):
412407
return
413408
downloaded_count, uptodate_count, processed_count, total = self._finish_counters(stats)
414409
exception_count = stats.get_value('process_exception/count', 0)
410+
remove_domain_cache = bool(self.remove_domain_cache_on_finished_miss)
415411
if total and processed_count < total:
416412
missing_count = total - processed_count
417413
self.say(font_color(f'miss: new[{downloaded_count}], cache[{uptodate_count}], miss[{missing_count}]<br>',
418414
cls='theme-err', size=3))
419-
self._remove_cache()
415+
if remove_domain_cache:
416+
self._remove_cache()
420417
elif total != 0 and processed_count > 0:
421418
if downloaded_count:
422419
_str = f'{self.res.finished_success % downloaded_count}'
@@ -428,7 +425,8 @@ def _handle_finished_status(self, stats):
428425
self.say(font_color(
429426
f'<br>{self.res.finished_err % last_exception}<br>log path/日志文件地址: [{self.settings.get("LOG_FILE")}]',
430427
cls='theme-err', size=3))
431-
self._remove_cache()
428+
if remove_domain_cache:
429+
self._remove_cache()
432430
else:
433431
self.say(font_color(f'{self.res.finished_empty}<br>', cls='theme-highlight', size=4))
434432

ComicSpider/spiders/ehentai.py

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -48,7 +48,7 @@ def frame_book(self, response):
4848
frame_results = {}
4949
targets = response.xpath('//table[contains(@class, "itg")]//td[contains(@class, "glcat")]/..')
5050
with ThreadPoolExecutor() as executor:
51-
books = list(executor.map(self.site.parser.parse_search_item, targets))
51+
books = list(executor.map(self.spider_site_runtime.parser.parse_search_item, targets))
5252
for x, book in enumerate(books):
5353
book.idx = x + 1
5454
frame_results[book.idx] = book

ComicSpider/spiders/h_comic.py

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -26,14 +26,14 @@ def ua(self):
2626
return HComicUtils.headers
2727

2828
def frame_section(self, response):
29-
book = self.site.parser.parse_book(response.text)
29+
book = self.spider_site_runtime.parser.parse_book(response.text)
3030
pages = int(book.pages or 0)
3131
if pages <= 0:
3232
self.say(font_color("未解析到页面信息,请稍后重试", cls="theme-err"))
3333
return {}
3434
media_id = getattr(book, "media_id", "")
3535
comic_source = getattr(book, "comic_source", "")
36-
image_prefix = self.site.parser.get_image_prefix(comic_source)
36+
image_prefix = self.spider_site_runtime.parser.get_image_prefix(comic_source)
3737
frame_results = {}
3838
for page in range(1, pages + 1):
3939
frame_results[page] = f"{image_prefix}/{media_id}/pages/{page}"

ComicSpider/spiders/hitomi.py

Lines changed: 16 additions & 13 deletions
Original file line number · Diff line number · Diff line change
@@ -1,17 +1,15 @@
11
# -*- coding: utf-8 -*-
2-
import json
32
import asyncio
43
from concurrent.futures import ThreadPoolExecutor
54
import scrapy
65

76
from ComicSpider.runtime.job_models import iter_download_items
87

9-
from utils import PresetHtmlEl, conf
8+
from utils import conf
109
from utils.website import HitomiUtils, get_loop
11-
from utils.processed_class import PreviewHtml
1210
from ComicSpider.items import ComicspiderItem
1311

14-
from .basecomicspider import BaseComicSpider, font_color
12+
from .basecomicspider import BaseComicSpider
1513

1614
domain = HitomiUtils.index
1715

@@ -35,7 +33,7 @@ class HitomiSpider(BaseComicSpider):
3533
def from_crawler(cls, crawler, *args, **kwargs):
3634
spider = super(HitomiSpider, cls).from_crawler(crawler, *args, **kwargs)
3735
try:
38-
spider.async_cli = spider.site.get_cli(conf, is_async=True)
36+
spider.async_cli = spider.spider_site_runtime.provider.reqer_cls.get_cli(conf, is_async=True)
3937
except Exception as e:
4038
if spider.crawler and spider.crawler.engine:
4139
spider.crawler.engine.close_spider(spider, reason=f"[error]{str(e)}")
@@ -47,7 +45,8 @@ def from_crawler(cls, crawler, *args, **kwargs):
4745
def _get_nozomi_sync(self, nozomi_url, page):
4846
"""同步包装的异步nozomi获取方法"""
4947
async def _async_get():
50-
headers = {**HitomiUtils.headers, "Range": self.site.runtime.get_range(page)}
48+
provider = self.spider_site_runtime.provider
49+
headers = {**provider.headers, "Range": provider.get_range(page)}
5150
return await self.async_cli.get(nozomi_url, headers=headers)
5251

5352
try:
@@ -65,7 +64,7 @@ def start_requests(self):
6564
# ==============================================
6665
def parse(self, response, meta):
6766
self._emit_process('parse')
68-
result = HitomiUtils.parse_nozomi(response.content)
67+
result = self.spider_site_runtime.provider.parse_nozomi(response.content)
6968

7069
meta = meta or {}
7170
meta['results'] = []
@@ -90,10 +89,7 @@ async def fetch_all():
9089

9190
# 整合actual_parse的功能
9291
for _, resp in sorted(resps, key=lambda x: x[0]): # 按原始索引排序
93-
meta['results'].append({
94-
"text": resp.text,
95-
"meta": {k: v for k, v in meta.items() if k != 'results'}
96-
})
92+
meta['results'].append({"text": resp.text, "meta": {k: v for k, v in meta.items() if k != 'results'}})
9793
yield from self.defer_parse(meta['results'])
9894

9995
def defer_parse(self, rets):
@@ -109,12 +105,17 @@ def parse_section(self, meta):
109105
this_uuid, this_md5 = book.id_and_md5()
110106
self._assert_task_not_downloaded(book)
111107
self.set_task(book)
108+
provider = self.spider_site_runtime.provider
109+
# Full-image URLs become invalid as soon as Hitomi rotates gg.b, even when the
110+
# cached bucket still looks "same-hour" by local heuristic. Refresh once per
111+
# download section before materializing image URLs.
112+
provider.refresh_gg_if_needed(force=True)
112113
for index, pic_info in enumerate(book.pics, 1):
113114
item = ComicspiderItem()
114115
item['title'] = book.name
115116
item['page'] = str(index)
116117
item['section'] = None
117-
img_url = self.site.runtime.get_img_url(pic_info['hash'], pic_info['hasavif'])
118+
img_url = provider.get_img_url(pic_info['hash'], pic_info['hasavif'])
118119
item['image_urls'] = [img_url]
119120
item['uuid'] = this_uuid
120121
item['uuid_md5'] = this_md5
@@ -138,8 +139,10 @@ def iter_download_requests(self, job):
138139
def frame_book(self, rets, meta):
139140
frame_results = {}
140141
texts = [target['text'] for target in rets]
142+
runtime_provider = self.spider_site_runtime.provider
143+
parser = runtime_provider.__class__.parser(runtime_provider)
141144
with ThreadPoolExecutor() as executor:
142-
books = list(executor.map(self.site.parser.parse_search_item, texts))
145+
books = list(executor.map(parser.parse_search_item, texts))
143146
for x, book in enumerate(books):
144147
book.idx = x + 1
145148
book.preview_url = f"{self.domain}{book.preview_url}"

0 commit comments

Comments
 (0)