Commit ce5229c8 authored by Benoit Fabrèges

reorder dead links by types

parent a4c3de3b
Pipeline #6064 failed with stages in 2 minutes and 24 seconds
@@ -7,15 +7,26 @@ from pathlib import Path
 import logging
 
+error_codes = {0 : "Good",
+               1 : "Timeout error",
+               2 : "Certificate validation error - check them with https://www.ssllabs.com/ssltest/analyze.html",
+               3 : "Connection error",
+               4 : "HTTP error code received (4XX or 5XX)",
+               5 : "File missing"}
+
 def check_file(filename):
     """
     check if file exist in output
     """
-    code = None
     avail = filename.exists()
-    msg = "File doesn't exist"
-    return avail, code, msg
+    if avail:
+        error_code = 0
+    else:
+        error_code = 5
+    return error_code
 
 def check_url(url, timeout_ms):
     """
@@ -25,8 +36,7 @@ def check_url(url, timeout_ms):
     :timeout_mx: timeout duration in ms
     :return: (success, HTTP code, message)
     """
-    success, code = False, None
-    msg = ""
+    error_code = 0
     timeout_s = timeout_ms * 1e-3
     try:
@@ -35,23 +45,20 @@ def check_url(url, timeout_ms):
         # - False: bypasses certificate validation completely
         r = requests.get(url, timeout=timeout_s, verify=True)
     except Timeout:
-        msg = "Timeout error"
+        error_code = 1
     except SSLError:
-        msg = "Certificate validation error - check it with https://www.ssllabs.com/ssltest/analyze.html"
+        error_code = 2
     except ConnectionError:
-        msg = "Connection error"
+        error_code = 3
     else:
-        code = r.status_code
+        http_code = r.status_code
         try:
             r.raise_for_status()
         except HTTPError:
-            msg = "Error code received"
-        else:
-            success = True
-            code == requests.codes.ok
+            error_code = 4
 
-    return success, code, msg
+    return error_code
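Not part of the diff: a minimal sketch, assuming only the error_codes table added in this commit, of the checkers' new contract of returning a single integer (0 for a good link, a non-zero code otherwise) and of how calling code can turn that integer back into a message. The describe helper and the temporary directory are made up for illustration.

from pathlib import Path
import tempfile

# Table mirroring the one added at the top of the plugin (sketch only).
error_codes = {0: "Good",
               1: "Timeout error",
               2: "Certificate validation error",
               3: "Connection error",
               4: "HTTP error code received (4XX or 5XX)",
               5: "File missing"}

def check_file(filename):
    # Same contract as the reworked plugin function: 0 if the file exists,
    # 5 ("File missing") otherwise.
    return 0 if filename.exists() else 5

def describe(error_code):
    # Hypothetical helper: unknown codes fall back to a generic message.
    return error_codes.get(error_code, "Unknown error")

with tempfile.TemporaryDirectory() as output:
    page = Path(output) / "index.html"
    page.write_text("<html></html>")
    print(describe(check_file(page)))                      # Good
    print(describe(check_file(Path(output) / "missing")))  # File missing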
@@ -79,7 +86,10 @@ def all_generators_finalized(generators):
     cache = {}
     tags = {'a' : 'href', 'img' : 'src', 'link' : 'href'}
-    for page in all_pages:
+    num_pages = len(all_pages)
+    for count_pages, page in enumerate(all_pages):
+        print(f"{count_pages} / {num_pages}", end="\r")
         soup_doc = BeautifulSoup(page.content, 'html.parser')
         for tag, attr in tags.items():
@@ -101,26 +111,38 @@ def all_generators_finalized(generators):
                 # We do not check twice for the same link
                 if url in cache:
-                    success, code, msg = cache[url]
+                    if cache[url][0] > 0:
+                        cache[url][1].add((page.url, page.get_relative_source_path()))
                 else:
                     if url.startswith('http'):
-                        success, code, msg = check_url(url, timeout_duration_ms)
+                        error_code = check_url(url, timeout_duration_ms)
                     else:
                         if url.startswith('/'):
                             internal_filename = output_path.joinpath(*(url.split(os.sep)))
                         else:
                             internal_filename = output_path.joinpath(Path(page.url).parent, url)
-                        success, code, msg = check_file(internal_filename.resolve())
+                        error_code = check_file(internal_filename.resolve())
-                    cache[url] = (success, code, msg)
-                    if not success:
-                        if code is not None:
-                            logging.error(f'Dead link (error code {code}): {url}\nIn file {page.url}\nFrom source file {page.get_relative_source_path()}')
-                        else:
-                            logging.warning(f'Check link ({msg}): {url}\nIn file {page.url}\nFrom source file {page.get_relative_source_path()}')
-                    else:
-                        logging.debug(f'Good link: {url}')
+                    cache[url] = (error_code, {(page.url, page.get_relative_source_path())})
+
+    # prints the results
+    results_by_code = {}
+    for url, req in cache.items():
+        if req[0] != 0:
+            try:
+                results_by_code[req[0]].append((url, req[1]))
+            except KeyError:
+                results_by_code[req[0]] = [(url, req[1])]
+
+    print()
+    for error_code, res in results_by_code.items():
+        msg = f"Dead links -- {error_codes[error_code]}\n"
+        for url in res:
+            msg += f"Link: {url[0]}\nIn file(s):\n"
+            for files in url[1]:
+                msg += f"\t{files[0]} -- from source file {files[1]}\n"
+            msg += "\n"
+        logging.error(msg)
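Also not part of the diff: a sketch of the new report-by-type pass with made-up cache entries, using a collections.defaultdict instead of the try/except KeyError above; the grouping and the printed report are otherwise the same as in the loop added by this hunk.

from collections import defaultdict
import logging

error_codes = {0: "Good", 1: "Timeout error", 5: "File missing"}

# url -> (error_code, {(page_url, source_path), ...}), as the plugin builds it
cache = {
    "https://example.org/gone": (1, {("blog/post.html", "content/post.md")}),
    "images/missing.png": (5, {("index.html", "content/index.md")}),
    "https://example.org/ok": (0, {("index.html", "content/index.md")}),
}

results_by_code = defaultdict(list)
for url, (error_code, referrers) in cache.items():
    if error_code != 0:  # only dead links are reported
        results_by_code[error_code].append((url, referrers))

for error_code, res in results_by_code.items():
    msg = f"Dead links -- {error_codes[error_code]}\n"
    for url, referrers in res:
        msg += f"Link: {url}\nIn file(s):\n"
        for page_url, source in referrers:
            msg += f"\t{page_url} -- from source file {source}\n"
        msg += "\n"
    logging.error(msg)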