Commit 48c4ebdd authored by Benoit Fabrèges

Add a plugin, based on the already existing one, to check for dead links

parent c5bb2ef2
Pipeline #5211 failed in 4 seconds
@@ -49,7 +49,7 @@ MARKUP = ('md', 'ipynb')
 PLUGIN_PATHS = ['./plugins']
 PLUGINS = ['ipynb.markup', 'pelican_dynamic', 'rst_directives', 'extract_toc', 'calcul_reader', 'calcul_filters',
-           'rst_include', 'sitemap', 'pelican-deadlinks', 'tipue_search']
+           'rst_include', 'sitemap', 'pelican-deadlinks', 'tipue_search', 'check_deadlinks']
 # Useful for 'rst_include' plugin (relative to content directory)
 RST_GLOBAL_INCLUDES = ['../plugins/rst_include/include.rst']
from pelican import signals
from bs4 import BeautifulSoup
import requests
from requests.exceptions import Timeout, RequestException
from pathlib import Path
import logging

def check_file(filename):
    """
    Check that the file exists in the output directory.

    :param filename: path of the local file to be checked
    :return: (availability, success, HTTP code); mirrors check_url(),
             the HTTP code is always None for local files
    """
    code = None
    avail = filename.exists()
    # A local file is "successful" exactly when it is available
    return avail, avail, code
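
# For illustration only (not part of the plugin logic): because check_file()
# returns the same triple as check_url(), a missing file reads like a dead URL:
#   check_file(Path('output/index.html'))  -> (True, True, None) if present
#   check_file(Path('output/missing.png')) -> (False, False, None) if absent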

def check_url(url, timeout_ms):
    """
    Open a connection to the given URL and check the status code.

    :param url: URL of the website to be checked
    :param timeout_ms: timeout duration in ms
    :return: (availability, success, HTTP code)
    """
    availability, success, code = (False, False, None)
    timeout_s = timeout_ms * 1e-3
    try:
        r = requests.get(url, timeout=timeout_s, verify=True)
        code = r.status_code
        availability = True
        success = code == requests.codes.ok
    except Timeout:
        # The server did not answer in time: unreachable, outcome unknown
        availability = False
        success = None
    except RequestException:
        # Connection-level failure (DNS, SSL, ...): the request itself failed
        availability = None
        success = False
    return availability, success, code
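
# For illustration only (not part of the plugin logic): how the caller below
# interprets the triple returned by check_url():
#   (True, True, 200)    reachable, HTTP OK         -> debug log
#   (True, False, 404)   reachable but broken       -> error log
#   (False, None, None)  timed out                  -> warning log
#   (None, False, None)  request failed (DNS, SSL)  -> warning log too, since
#                        `not None` is truthy in the `if not avail` test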

def register():
    """
    Register the dead-link check on Pelican's all_generators_finalized signal
    """
    signals.all_generators_finalized.connect(all_generators_finalized)

def all_generators_finalized(generators):
    """
    Pelican callback: walk every page and check each link it contains.
    """
    timeout_duration_ms = 8000
    settings = generators[-1].settings
    output_path = Path(settings.get('OUTPUT_PATH'))
    #all_pages = generators[-1].context['articles'] + \
    #            generators[-1].context['pages'] + \
    #            generators[-1].context['hidden_pages']
    all_pages = generators[-1].context['pages']
    cache = {}
    tags = {'a': 'href', 'img': 'src'}
    for page in all_pages:
        soup_doc = BeautifulSoup(page.content, 'html.parser')
        for tag, attr in tags.items():
            for anchor in soup_doc(tag):
                if attr not in anchor.attrs:
                    continue
                url = anchor[attr]
                # Skipping emails
                if url.startswith('mailto'):
                    continue
                # TODO: check links to ids
                if '#' in url:
                    url, idname = url.split('#', 1)
                    if not url:
                        continue
                # We do not check twice for the same link
                if url in cache:
                    avail, success, code = cache[url]
                else:
                    if url.startswith('http'):
                        avail, success, code = check_url(url, timeout_duration_ms)
                    else:
                        # URLs always use '/' as separator, regardless of the OS
                        if url.startswith('/'):
                            internal_filename = output_path.joinpath(*url.split('/'))
                        else:
                            internal_filename = output_path.joinpath(Path(page.url).parent, url)
                        avail, success, code = check_file(internal_filename.resolve())
                    cache[url] = (avail, success, code)
                if not avail:
                    logging.warning(f'Check link (not available): {url}\nIn file {page.url}')
                elif not success:
                    logging.error(f'Dead link (error code {code}): {url}\nIn file {page.url}')
                else:
                    logging.debug(f'Good link: {url}')
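
Usage note (a sketch, assuming the usual Pelican layout): the plugin only needs to sit in one of the PLUGIN_PATHS directories and be listed in PLUGINS, as in the settings hunk above. The check then runs once all generators have finished, on every build, for example:

pelican content -s pelicanconf.py

Timeouts and connection failures are only logged as warnings, while genuine HTTP error codes are logged as dead links.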