Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions doc/usage/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3952,6 +3952,34 @@ and the number of workers to use.

.. versionadded:: 1.1

.. confval:: linkcheck_cache
:type: :code-py:`bool`
:default: :code-py:`False`

Whether to cache the successful results of each URL check.
If a :confval:`cache file <linkcheck_cache_file>` from a previous build is present,
its successful results are reused until their age exceeds the
configured :confval:`duration <linkcheck_cache_duration>`.

.. versionadded:: TBD

.. confval:: linkcheck_cache_file
:type: :code-py:`str`
:default: :code-py:`'linkcheck_cache.json'`

File from which the linkcheck cache state is read and to which it is written.
The path is relative to the builder's output directory.

.. versionadded:: TBD

.. confval:: linkcheck_cache_duration
:type: :code-py:`float`
:default: :code-py:`7.0`

The number of days to reuse a successful linkcheck result.

.. versionadded:: TBD


Domain options
==============
Expand Down
69 changes: 65 additions & 4 deletions sphinx/builders/linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
import re
import socket
import time
from datetime import UTC, datetime, timedelta
from enum import StrEnum
from html.parser import HTMLParser
from queue import PriorityQueue, Queue
from threading import Thread
from threading import Lock, Thread
from typing import TYPE_CHECKING, NamedTuple, cast
from urllib.parse import quote, unquote, urlparse, urlsplit, urlunparse

Expand Down Expand Up @@ -55,6 +56,7 @@ class _Status(StrEnum):
TIMEOUT = 'timeout'
UNCHECKED = 'unchecked'
UNKNOWN = 'unknown'
CACHED = 'cached'
WORKING = 'working'


Expand All @@ -70,6 +72,8 @@ class _Status(StrEnum):
QUEUE_POLL_SECS = 1
DEFAULT_DELAY = 60.0

cache_file_lock = Lock()


@object.__new__
class _SENTINEL_LAR:
Expand All @@ -94,7 +98,12 @@ def init(self) -> None:
socket.setdefaulttimeout(5.0)

def finish(self) -> None:
checker = HyperlinkAvailabilityChecker(self.config)
cache_file = (
self.outdir / self.config.linkcheck_cache_file
if self.config.linkcheck_cache
else None
)
checker = HyperlinkAvailabilityChecker(self.config, cache_file)
logger.info('')

output_text = self.outdir / 'output.txt'
Expand Down Expand Up @@ -138,6 +147,8 @@ def process_result(self, result: CheckResult) -> None:
logger.info(darkgray('-ignored- ') + msg) # NoQA: G003
case _Status.WORKING:
logger.info(darkgreen('ok ') + f'{res_uri}{result.message}') # NoQA: G003
case _Status.CACHED:
logger.info(darkgreen('cached ') + f'{res_uri} - {result.message}') # NoQA: G003
case _Status.TIMEOUT:
if self.config.verbosity < 0:
msg = 'timeout ' + f'{res_uri}{result.message}'
Expand Down Expand Up @@ -295,7 +306,7 @@ class Hyperlink(NamedTuple):


class HyperlinkAvailabilityChecker:
def __init__(self, config: Config) -> None:
def __init__(self, config: Config, cache_file: _StrPath | None) -> None:
self.config = config
self.rate_limits: dict[str, RateLimit] = {}
self.rqueue: Queue[CheckResult] = Queue()
Expand All @@ -306,6 +317,13 @@ def __init__(self, config: Config) -> None:
self.to_ignore: list[re.Pattern[str]] = list(
map(re.compile, self.config.linkcheck_ignore)
)
self.last_cache_result = {}
self.cache_file = cache_file
if self.cache_file and self.cache_file.exists():
with self.cache_file.open('r') as f:
self.last_cache_result = json.load(f)
self.now = datetime.now(UTC)
self.cache_duration = timedelta(days=self.config.linkcheck_cache_duration)

def check(self, hyperlinks: dict[str, Hyperlink]) -> Iterator[CheckResult]:
self.invoke_threads()
Expand All @@ -322,6 +340,25 @@ def check(self, hyperlinks: dict[str, Hyperlink]) -> Iterator[CheckResult]:
code=0,
)
else:
if (
self.config.linkcheck_cache
and hyperlink.uri in self.last_cache_result
):
last_succesfful_time = datetime.fromtimestamp(
self.last_cache_result[hyperlink.uri], UTC
)
age = self.now - last_succesfful_time
if age < self.cache_duration:
# Cache is still valid
yield CheckResult(
uri=hyperlink.uri,
docname=hyperlink.docname,
lineno=hyperlink.lineno,
status=_Status.CACHED,
message=last_succesfful_time.strftime('%Y-%m-%d %H:%M'),
code=0,
)
continue
self.wqueue.put(CheckRequest(CHECK_IMMEDIATELY, hyperlink), False)
total_links += 1

Expand All @@ -335,7 +372,7 @@ def check(self, hyperlinks: dict[str, Hyperlink]) -> Iterator[CheckResult]:
def invoke_threads(self) -> None:
for _i in range(self.num_workers):
thread = HyperlinkAvailabilityCheckWorker(
self.config, self.rqueue, self.wqueue, self.rate_limits
self.config, self.rqueue, self.wqueue, self.rate_limits, self.cache_file
)
thread.start()
self.workers.append(thread)
Expand Down Expand Up @@ -372,7 +409,9 @@ def __init__(
rqueue: Queue[CheckResult],
wqueue: Queue[CheckRequest],
rate_limits: dict[str, RateLimit],
cache_file: _StrPath | None = None,
) -> None:
self.cache_file = cache_file
self.rate_limits = rate_limits
self.rqueue = rqueue
self.wqueue = wqueue
Expand Down Expand Up @@ -485,6 +524,23 @@ def _check(self, docname: str, uri: str, hyperlink: Hyperlink) -> _URIProperties
if status != _Status.BROKEN:
break

# Only cache successful results that were actually produced by _check_uri
if self.cache_file and status == _Status.WORKING:
with cache_file_lock:
if self.cache_file.exists():
with self.cache_file.open('r') as f:
cache_state = json.load(f)
if not isinstance(cache_state, dict):
logger.warning(
__('Previous linkcheck cache is malformed. Recreating it.')
)
cache_state = {}
else:
cache_state = {}
cache_state[uri] = datetime.now(UTC).timestamp()
with self.cache_file.open('w') as f:
json.dump(cache_state, f)

return status, info, code

def _retrieval_methods(
Expand Down Expand Up @@ -844,6 +900,11 @@ def setup(app: Sphinx) -> ExtensionMetadata:
'',
types=frozenset({frozenset, list, set, tuple}),
)
app.add_config_value('linkcheck_cache', False, '', types=frozenset({bool}))
app.add_config_value(
'linkcheck_cache_file', 'linkcheck_cache.json', '', types=frozenset({str})
)
app.add_config_value('linkcheck_cache_duration', 7.0, '', types=frozenset({float}))

app.add_event('linkcheck-process-uri')

Expand Down
84 changes: 84 additions & 0 deletions tests/test_builders/test_build_linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,90 @@ def test_anchors_ignored_for_url(app: SphinxTestApp) -> None:
}


@pytest.mark.sphinx(
    'linkcheck',
    testroot='linkcheck',
    freshenv=True,
)
def test_cache(app: SphinxTestApp) -> None:
    """Check that successful linkcheck results are cached and reused.

    The test builds twice with ``linkcheck_cache`` enabled:

    1. The first build is served by ``OKHandler``; the URIs reported as
       ``working`` over HTTP are expected to end up in the cache file.
    2. One cache entry is then expired by hand, and the second build is served
       by a handler that answers GET requests with a 500 error.  Entries that
       are still fresh are expected to be reported as ``cached``, while the
       expired one is re-checked and reported as ``broken``.

    Finally the cache file is expected to be unchanged by the second build.
    """
    app.config.linkcheck_cache = True

    # Handler for the second build: every GET fails with a 500 error,
    # presumably so that a non-broken status can only come from the cache.
    class InternalServerErrorHandler(BaseHTTPRequestHandler):
        protocol_version = 'HTTP/1.1'

        def do_GET(self) -> None:
            self.send_error(500, 'Internal Server Error')

    # First run doing caching
    with serve_application(app, OKHandler) as address:
        app.build()

    assert (app.outdir / 'output.json').exists()
    output_content = (app.outdir / 'output.json').read_text(encoding='utf8')

    # output.json holds one JSON document per line; index the rows by URI.
    rows = [json.loads(x) for x in output_content.splitlines()]
    assert len(rows) == 10
    rowsby = {row['uri']: row for row in rows}
    assert rowsby[f'http://{address}/']['status'] == 'working'
    assert rowsby[f'http://{address}/#!bar']['status'] == 'working'
    assert rowsby[f'http://{address}/image.png']['status'] == 'working'
    assert rowsby[f'http://{address}/image2.png']['status'] == 'working'
    assert rowsby['conf.py']['status'] == 'working'
    assert rowsby['path/to/notfound']['status'] == 'broken'
    assert rowsby[f'http://{address}/#top']['status'] == 'broken'

    # The cache file written by the first run maps each cached URI to a value
    # (treated as a timestamp below, where 0.0 is used to mark an entry as old).
    assert (app.outdir / app.config.linkcheck_cache_file).exists()
    with (app.outdir / app.config.linkcheck_cache_file).open('r') as f:
        cache_initial = json.load(f)
    assert len(cache_initial) == 5
    assert f'http://{address}/' in cache_initial
    assert f'http://{address}/#!bar' in cache_initial
    assert f'http://{address}/image.png' in cache_initial
    assert 'conf.py' not in cache_initial  # because it does not use http
    assert f'http://{address}/#top' not in cache_initial  # because it was broken

    # Second run with cached values
    # Manually expire a cache item
    cache_initial[f'http://{address}/image2.png'] = 0.0
    with (app.outdir / app.config.linkcheck_cache_file).open('w') as f:
        json.dump(cache_initial, f)

    # Serve on the port of the first run so that the checked URIs
    # (which embed ``address``) match the keys already in the cache.
    with serve_application(
        app, InternalServerErrorHandler, port=int(address.split(':')[1])
    ):
        app.build()

    assert (app.outdir / 'output.json').exists()
    output_content = (app.outdir / 'output.json').read_text(encoding='utf8')

    rows = [json.loads(x) for x in output_content.splitlines()]
    assert len(rows) == 10
    rowsby = {row['uri']: row for row in rows}
    assert rowsby[f'http://{address}/']['status'] == 'cached'
    assert rowsby[f'http://{address}/#!bar']['status'] == 'cached'
    assert rowsby[f'http://{address}/image.png']['status'] == 'cached'
    assert (
        rowsby[f'http://{address}/image2.png']['status'] == 'broken'
    )  # because cache expired
    assert rowsby['conf.py']['status'] == 'working'
    assert rowsby['path/to/notfound']['status'] == 'broken'
    assert rowsby[f'http://{address}/#top']['status'] == 'broken'

    # The second run produced no new successful checks, so the cache file is
    # expected to hold the same entries, including the manually expired one.
    assert (app.outdir / app.config.linkcheck_cache_file).exists()
    with (app.outdir / app.config.linkcheck_cache_file).open('r') as f:
        cache_after = json.load(f)
    assert len(cache_after) == 5
    assert f'http://{address}/' in cache_after
    assert f'http://{address}/#!bar' in cache_after
    assert f'http://{address}/image.png' in cache_after
    assert f'http://{address}/image2.png' in cache_after
    assert cache_after[f'http://{address}/image2.png'] == 0.0
    assert 'conf.py' not in cache_after
    assert f'http://{address}/#top' not in cache_after

    # Every stored value must be exactly what it was before the second run.
    assert all(cache_initial[uri] == cache_after[uri] for uri in cache_initial)


@pytest.mark.sphinx(
'linkcheck',
testroot='linkcheck-localserver-anchor',
Expand Down
Loading