Commit 9d7c674b authored by Matthieu Boileau

Fix a major bug in references

parent b368c284
......@@ -21,6 +21,7 @@ yaml = YAML(typ='safe')
SKIP_REASON = {"skip_rub": "belonging to a skipped rubrique",
"empty": "empty content",
"unpub": "not published"}
SHORTEN = {'article': 'art', 'rubrique': 'rub', 'breve': 'brev', 'message': 'mess'}
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
......@@ -157,13 +158,13 @@ def link(s, website):
text = matchobj.group(1).strip()
url = matchobj.group(2).strip()
def nullify_url(reason, id_art, text, url):
def nullify_url(id_art, text, url):
"""Throw WARNING message and return empty URL"""
msg = f" WARNING: nullify link to {reason} {id_art}\n"
msg = f" WARNING: nullify link to non existing article {id_art}\n"
msg += f" text: {text}\n"
msg += f" url: {url}"
logger.warning(msg)
website.nullified_urls[reason] += 1
website.nullified_urls += 1
return ""
# Remove "mailto:" prefix from URL
......@@ -179,31 +180,20 @@ def link(s, website):
new_url = url
else:
doc_url = re.match(r"\Adoc([0-9]+)", url)
art_url = re.match(r"\A(art|rub|brev)([0-9]+)", url)
if doc_url:
# [text->doc#]
new_url = os.path.join(website.attachments_prefix, "IMG", website.doc_index[int(doc_url.group(1))])
elif art_url:
# [text->art#,rub#,brev#]
art_id = art_url.group(0)
try:
new_url = "{filename}/" + website.article_index[art_id]
except KeyError:
new_url = nullify_url(art_id, text, url)
else:
art_url = re.match(r"\Aart([0-9]+)", url)
if art_url:
# [text->art#]
id_art = int(art_url.group(1))
try:
new_url = "{filename}/" + website.article_index[id_art]
except KeyError:
new_url = nullify_url("non existing article", id_art, text, url)
else:
rub_url = re.match(r"\Arub([0-9]+)", url)
if rub_url:
# [text->rub#]
id_rub = int(rub_url.group(1))
category = website.category_index[id_rub]
if category == 'skip':
new_url = nullify_url("skipped rubrique", id_rub, text, url)
else:
new_url = os.path.join(f"{category}.html")
else:
# [text->path_to_file]
new_url = os.path.join(website.attachments_prefix, url)
# [text->path_to_file]
new_url = os.path.join(website.attachments_prefix, url)
new_link = f"[{text}]({new_url})"
return new_link
......@@ -306,12 +296,13 @@ class Article:
self.category = self.website.category_index[spip_article['id_rubrique']]
id_tag = 'id_' + self.type
self.id = spip_article[id_tag]
self.short_id = spip_article[id_tag]
self.id = f"{SHORTEN[self.type]}{self.short_id}"
self.title = spip_article['titre']
if self.category == 'skip':
self.skip_reason = SKIP_REASON["skip_rub"]
elif not spip_article['texte']:
elif not spip_article['texte'] and not self.type == 'rubrique':
self.skip_reason = SKIP_REASON["empty"]
elif spip_article['statut'] != 'publie':
self.skip_reason = SKIP_REASON["unpub"]
......@@ -320,7 +311,7 @@ class Article:
if not self.skip_reason:
self.mdprefix = f"spip_{self.type}-{self.id}"
self.mdprefix = f"spip_{self.type}-{self.short_id}"
self.mdpath = os.path.join(self.category, self.mdprefix + ".md")
try:
self.date = spip_article['date']
......@@ -328,11 +319,11 @@ class Article:
self.date = spip_article['date_heure']
self.modified = spip_article.get('date_modif', self.date)
self.summary = spip_article.get('descriptif', '')
self.summary = spip_article.get('descriptif', None)
self.text = spip_article['texte']
try:
self.authors = self.website.author_index[self.type][self.id]
self.authors = self.website.author_index[self.id]
except KeyError:
self.authors = self.website.default_author
......@@ -353,7 +344,7 @@ class Article:
This is the content of my super blog post.
"""
logger.info(f"{self.type}_{self.id}: {self.title}")
logger.info(f"{self.type}_{self.short_id}: {self.title}")
if self.skip_reason:
logger.warning(f" WARNING: skipping because {self.skip_reason}")
......@@ -394,11 +385,6 @@ class Website:
AUTHORS_FILENAME = "spip_auteurs_clean.yml"
AUTHORS_LINKS_FILENAME = "spip_auteurs_liens.yml"
class MissingKeyDict(dict):
"""A dictionnary that returns 0 value if key is missing"""
def __missing__(self, key):
return 0
@staticmethod
def _load_and_clean_yaml(filename):
"""Load yaml file filename, clean it and return a dictionary"""
......@@ -411,7 +397,7 @@ class Website:
self.doc_index = {}
self.author_index = {}
self.nullified_urls = self.MissingKeyDict()
self.nullified_urls = 0
self.articles = []
config_filename = "config.yml"
......@@ -492,11 +478,10 @@ class Website:
for authors_link in authors_links:
spip_type = authors_link['objet']
art_id = authors_link['id_objet']
# art_id = art23, rub45, brev98, etc.
art_id = f"{SHORTEN[spip_type]}{(authors_link['id_objet'])}"
author_id = authors_link['id_auteur']
if spip_type not in self.author_index.keys():
self.author_index[spip_type] = {}
self.author_index[spip_type][art_id] = author_name_index[author_id]
self.author_index[art_id] = author_name_index[author_id]
def _build_articles(self):
"""
......@@ -542,8 +527,7 @@ class Website:
logger.info(f" {processed.count('')} converted articles")
for k in SKIP_REASON:
logger.warning(f" {processed.count(SKIP_REASON[k])} skipped articles because {SKIP_REASON[k]}")
nullified = sum([value for value in self.nullified_urls.values()])
logger.warning(f" {nullified} nullified URLs {self.nullified_urls}")
logger.warning(f" {self.nullified_urls} nullified URLs")
logger.info("-------")
logger.debug("See ouput.log")
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment