diff --git a/doc/usage/configuration.rst b/doc/usage/configuration.rst index 6bb45d41840..b464186ad1b 100644 --- a/doc/usage/configuration.rst +++ b/doc/usage/configuration.rst @@ -3952,6 +3952,34 @@ and the number of workers to use. .. versionadded:: 1.1 +.. confval:: linkcheck_cache + :type: :code-py:`bool` + :default: :code-py:`False` + + Whether to cache the successful linkcheck result of each URL check. + If a :confval:`cache file <linkcheck_cache_file>` of a previous build is present, + the previous successful results will be re-used until their age exceeds the + configured :confval:`duration <linkcheck_cache_duration>`. + + .. versionadded:: TBD + +.. confval:: linkcheck_cache_file + :type: :code-py:`str` + :default: :code-py:`'linkcheck_cache.json'` + + File from which the linkcheck cache state is read and to which it is written. + The path is relative to the builder's output directory. + + .. versionadded:: TBD + +.. confval:: linkcheck_cache_duration + :type: :code-py:`float` + :default: :code-py:`7.0` + + The number of days to reuse a successful linkcheck result. + + .. 
versionadded:: TBD + Domain options ============== diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 5889e75d126..0e15cb8a724 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -7,10 +7,11 @@ import re import socket import time +from datetime import UTC, datetime, timedelta from enum import StrEnum from html.parser import HTMLParser from queue import PriorityQueue, Queue -from threading import Thread +from threading import Lock, Thread from typing import TYPE_CHECKING, NamedTuple, cast from urllib.parse import quote, unquote, urlparse, urlsplit, urlunparse @@ -55,6 +56,7 @@ class _Status(StrEnum): TIMEOUT = 'timeout' UNCHECKED = 'unchecked' UNKNOWN = 'unknown' + CACHED = 'cached' WORKING = 'working' @@ -70,6 +72,8 @@ class _Status(StrEnum): QUEUE_POLL_SECS = 1 DEFAULT_DELAY = 60.0 +cache_file_lock = Lock() + @object.__new__ class _SENTINEL_LAR: @@ -94,7 +98,12 @@ def init(self) -> None: socket.setdefaulttimeout(5.0) def finish(self) -> None: - checker = HyperlinkAvailabilityChecker(self.config) + cache_file = ( + self.outdir / self.config.linkcheck_cache_file + if self.config.linkcheck_cache + else None + ) + checker = HyperlinkAvailabilityChecker(self.config, cache_file) logger.info('') output_text = self.outdir / 'output.txt' @@ -138,6 +147,8 @@ def process_result(self, result: CheckResult) -> None: logger.info(darkgray('-ignored- ') + msg) # NoQA: G003 case _Status.WORKING: logger.info(darkgreen('ok ') + f'{res_uri}{result.message}') # NoQA: G003 + case _Status.CACHED: + logger.info(darkgreen('cached ') + f'{res_uri} - {result.message}') # NoQA: G003 case _Status.TIMEOUT: if self.config.verbosity < 0: msg = 'timeout ' + f'{res_uri}{result.message}' @@ -295,7 +306,7 @@ class Hyperlink(NamedTuple): class HyperlinkAvailabilityChecker: - def __init__(self, config: Config) -> None: + def __init__(self, config: Config, cache_file: _StrPath | None) -> None: self.config = config self.rate_limits: dict[str, 
RateLimit] = {} self.rqueue: Queue[CheckResult] = Queue() @@ -306,6 +317,13 @@ def __init__(self, config: Config) -> None: self.to_ignore: list[re.Pattern[str]] = list( map(re.compile, self.config.linkcheck_ignore) ) + self.last_cache_result = {} + self.cache_file = cache_file + if self.cache_file and self.cache_file.exists(): + with self.cache_file.open('r') as f: + self.last_cache_result = json.load(f) + self.now = datetime.now(UTC) + self.cache_duration = timedelta(days=self.config.linkcheck_cache_duration) def check(self, hyperlinks: dict[str, Hyperlink]) -> Iterator[CheckResult]: self.invoke_threads() @@ -322,6 +340,25 @@ def check(self, hyperlinks: dict[str, Hyperlink]) -> Iterator[CheckResult]: code=0, ) else: + if ( + self.config.linkcheck_cache + and hyperlink.uri in self.last_cache_result + ): + last_succesfful_time = datetime.fromtimestamp( + self.last_cache_result[hyperlink.uri], UTC + ) + age = self.now - last_succesfful_time + if age < self.cache_duration: + # Cache is still valid + yield CheckResult( + uri=hyperlink.uri, + docname=hyperlink.docname, + lineno=hyperlink.lineno, + status=_Status.CACHED, + message=last_succesfful_time.strftime('%Y-%m-%d %H:%M'), + code=0, + ) + continue self.wqueue.put(CheckRequest(CHECK_IMMEDIATELY, hyperlink), False) total_links += 1 @@ -335,7 +372,7 @@ def check(self, hyperlinks: dict[str, Hyperlink]) -> Iterator[CheckResult]: def invoke_threads(self) -> None: for _i in range(self.num_workers): thread = HyperlinkAvailabilityCheckWorker( - self.config, self.rqueue, self.wqueue, self.rate_limits + self.config, self.rqueue, self.wqueue, self.rate_limits, self.cache_file ) thread.start() self.workers.append(thread) @@ -372,7 +409,9 @@ def __init__( rqueue: Queue[CheckResult], wqueue: Queue[CheckRequest], rate_limits: dict[str, RateLimit], + cache_file: _StrPath | None = None, ) -> None: + self.cache_file = cache_file self.rate_limits = rate_limits self.rqueue = rqueue self.wqueue = wqueue @@ -485,6 +524,23 @@ def 
_check(self, docname: str, uri: str, hyperlink: Hyperlink) -> _URIProperties if status != _Status.BROKEN: break + # Only cache successful results which actually ran the _check_uri + if self.cache_file and status == _Status.WORKING: + with cache_file_lock: + if self.cache_file.exists(): + with self.cache_file.open('r') as f: + cache_state = json.load(f) + if not isinstance(cache_state, dict): + logger.warning( + __('Previous linkcheck cache is malformed. Recreating it.') + ) + cache_state = {} + else: + cache_state = {} + cache_state[uri] = datetime.now(UTC).timestamp() + with self.cache_file.open('w') as f: + json.dump(cache_state, f) + return status, info, code def _retrieval_methods( @@ -844,6 +900,11 @@ def setup(app: Sphinx) -> ExtensionMetadata: '', types=frozenset({frozenset, list, set, tuple}), ) + app.add_config_value('linkcheck_cache', False, '', types=frozenset({bool})) + app.add_config_value( + 'linkcheck_cache_file', 'linkcheck_cache.json', '', types=frozenset({str}) + ) + app.add_config_value('linkcheck_cache_duration', 7.0, '', types=frozenset({float})) app.add_event('linkcheck-process-uri') diff --git a/tests/test_builders/test_build_linkcheck.py b/tests/test_builders/test_build_linkcheck.py index 7b036ec4506..797c4f6067e 100644 --- a/tests/test_builders/test_build_linkcheck.py +++ b/tests/test_builders/test_build_linkcheck.py @@ -410,6 +410,90 @@ def test_anchors_ignored_for_url(app: SphinxTestApp) -> None: } +@pytest.mark.sphinx( + 'linkcheck', + testroot='linkcheck', + freshenv=True, +) +def test_cache(app: SphinxTestApp) -> None: + app.config.linkcheck_cache = True + + class InternalServerErrorHandler(BaseHTTPRequestHandler): + protocol_version = 'HTTP/1.1' + + def do_GET(self) -> None: + self.send_error(500, 'Internal Server Error') + + # First run doing caching + with serve_application(app, OKHandler) as address: + app.build() + + assert (app.outdir / 'output.json').exists() + output_content = (app.outdir / 
'output.json').read_text(encoding='utf8') + + rows = [json.loads(x) for x in output_content.splitlines()] + assert len(rows) == 10 + rowsby = {row['uri']: row for row in rows} + assert rowsby[f'http://{address}/']['status'] == 'working' + assert rowsby[f'http://{address}/#!bar']['status'] == 'working' + assert rowsby[f'http://{address}/image.png']['status'] == 'working' + assert rowsby[f'http://{address}/image2.png']['status'] == 'working' + assert rowsby['conf.py']['status'] == 'working' + assert rowsby['path/to/notfound']['status'] == 'broken' + assert rowsby[f'http://{address}/#top']['status'] == 'broken' + + assert (app.outdir / app.config.linkcheck_cache_file).exists() + with (app.outdir / app.config.linkcheck_cache_file).open('r') as f: + cache_initial = json.load(f) + assert len(cache_initial) == 5 + assert f'http://{address}/' in cache_initial + assert f'http://{address}/#!bar' in cache_initial + assert f'http://{address}/image.png' in cache_initial + assert 'conf.py' not in cache_initial # because it does not use http + assert f'http://{address}/#top' not in cache_initial # because it was broken + + # Second run with cached values + # Manually expire a cache item + cache_initial[f'http://{address}/image2.png'] = 0.0 + with (app.outdir / app.config.linkcheck_cache_file).open('w') as f: + json.dump(cache_initial, f) + + with serve_application( + app, InternalServerErrorHandler, port=int(address.split(':')[1]) + ): + app.build() + + assert (app.outdir / 'output.json').exists() + output_content = (app.outdir / 'output.json').read_text(encoding='utf8') + + rows = [json.loads(x) for x in output_content.splitlines()] + assert len(rows) == 10 + rowsby = {row['uri']: row for row in rows} + assert rowsby[f'http://{address}/']['status'] == 'cached' + assert rowsby[f'http://{address}/#!bar']['status'] == 'cached' + assert rowsby[f'http://{address}/image.png']['status'] == 'cached' + assert ( + rowsby[f'http://{address}/image2.png']['status'] == 'broken' + ) # because 
cache expired + assert rowsby['conf.py']['status'] == 'working' + assert rowsby['path/to/notfound']['status'] == 'broken' + assert rowsby[f'http://{address}/#top']['status'] == 'broken' + + assert (app.outdir / app.config.linkcheck_cache_file).exists() + with (app.outdir / app.config.linkcheck_cache_file).open('r') as f: + cache_after = json.load(f) + assert len(cache_after) == 5 + assert f'http://{address}/' in cache_after + assert f'http://{address}/#!bar' in cache_after + assert f'http://{address}/image.png' in cache_after + assert f'http://{address}/image2.png' in cache_after + assert cache_after[f'http://{address}/image2.png'] == 0.0 + assert 'conf.py' not in cache_after + assert f'http://{address}/#top' not in cache_after + + assert all(cache_initial[uri] == cache_after[uri] for uri in cache_initial) + + @pytest.mark.sphinx( 'linkcheck', testroot='linkcheck-localserver-anchor',