Commit 9818f697 authored by Matthieu Boileau
Browse files

Read parameters from config.yml (Fix #11)

parent 60edf76e
......@@ -33,6 +33,9 @@ Ce script produit le fichier `spip_articles_clean.yml`.
### 4. Convertir les articles spip en articles pelican
- Editer le fichier `config.yml`
- Lancer la conversion:
```
python3 spip2pelican.py
```
......
site_url: http://calcul.math.cnrs.fr
authors: Groupe Calcul
attachments_prefix: attachments/spip/
categories:
journees: 4
formations: 39
paysage:
- 6
- 10
- 14
groupe: 2
skip: 49
rubriques_filename: spip_rubriques_clean.yml
articles_filename: spip_articles_clean.yml
documents_filename: spip_documents.yml
......@@ -16,12 +16,6 @@ import sys
from yaml_cleaner import strip_invalid, remove_null_date
SITE_URL = "http://calcul.math.cnrs.fr"
AUTHORS = "Groupe Calcul"
CATEGORIES = {4: "journees", 39: "formations", 6: "paysage", 10: "paysage", 14: "paysage", 2: "groupe", 49: "skip",
-1: "spip_divers"}
ATTACHMENTS_PREFIX = "attachments/spip/"
yaml = YAML(typ='safe')
SKIP_REASON = {"skip_rub": "belonging to a skipped rubrique",
......@@ -73,7 +67,7 @@ def bold(s):
return re.sub(r"(^|[^{]){{([^}]+)}}([^}]|$)", r"\1**\2**\3", s)
def html_links(s):
def html_link(s, website):
"""Replace html href by the right Pelican (relative) URL"""
def link_replace(matchobj):
......@@ -90,25 +84,26 @@ def html_links(s):
def link_replace_doc(matchobj):
"""Prepend attachment document path with attachment prefix"""
return ATTACHMENTS_PREFIX + matchobj.group(2)
return website.attachments_prefix + matchobj.group(2)
soup = bs4.BeautifulSoup(s, "html.parser")
for link in soup.find_all('a'):
link_url = link.get('href')
if link_url:
new_url = re.sub(r"\A{}/spip.php\?(article|rubrique)([0-9]+)(.*)".format(SITE_URL), link_replace, link_url)
new_url = re.sub(r"\A({}/|)(Documents/.*)".format(SITE_URL), link_replace_doc, new_url)
new_url = re.sub(r"\A{}/spip.php\?(article|rubrique)([0-9]+)(.*)".format(website.site_url), link_replace,
link_url)
new_url = re.sub(r"\A({}/|)(Documents/.*)".format(website.site_url), link_replace_doc, new_url)
link['href'] = new_url
return soup.prettify(formatter=None) # formatter=None to avoid ">" -> "&gt;" conversion
def html_img(s):
def html_img(s, website):
"""Replace html img src by the right Pelican (relative) URL"""
def src_replace(matchobj):
"""Prepend attachment image path with attachment prefix"""
return ATTACHMENTS_PREFIX + matchobj.group(0)
return website.attachments_prefix + matchobj.group(0)
soup = bs4.BeautifulSoup(s, "html.parser")
for img in soup.find_all('img'):
......@@ -156,7 +151,7 @@ def link(s, website):
doc_url = re.match(r"\Adoc([0-9]+)", url)
if doc_url:
# [text->doc#]
new_url = os.path.join(ATTACHMENTS_PREFIX, "IMG", website.doc_index[int(doc_url.group(1))])
new_url = os.path.join(website.attachments_prefix, "IMG", website.doc_index[int(doc_url.group(1))])
else:
art_url = re.match(r"\Aart([0-9]+)", url)
if art_url:
......@@ -178,7 +173,7 @@ def link(s, website):
new_url = os.path.join(f"{category}.html")
else:
# [text->path_to_file]
new_url = os.path.join(ATTACHMENTS_PREFIX, url)
new_url = os.path.join(website.attachments_prefix, url)
new_link = f"[{text}]({new_url})"
return new_link
......@@ -197,7 +192,7 @@ def document(s, website):
"""A call back function to replace a Spip doc by a Pelican link"""
doc_type = matchobj.group(1)
doc_id = int(matchobj.group(2))
url = os.path.join(ATTACHMENTS_PREFIX, "IMG", website.doc_index[doc_id])
url = os.path.join(website.attachments_prefix, "IMG", website.doc_index[doc_id])
if doc_type == 'doc':
return f"[Document]({url})"
else:
......@@ -252,8 +247,8 @@ def horizontal_rule(s):
def spip_to_markdown(s, website):
"""Convert string from Spip format to Pelican markdown format"""
s = html_links(s)
s = html_img(s)
s = html_link(s, website)
s = html_img(s, website)
s = italic(s)
s = bold(s)
s = ordered_list(s)
......@@ -324,7 +319,7 @@ class Article:
self.title = spip_to_markdown(self.title, self.website).strip() # strip to remove any CR at end of string
tags = []
authors = AUTHORS
authors = self.website.authors
content = spip_to_markdown(self.text, self.website)
header = f"""\
title: {self.title}
......@@ -347,38 +342,59 @@ summary: {self.summary}
return self.skip_reason
def reset_output_directories():
    """Wipe the "content" tree and recreate one empty sub-directory per exported category"""
    output_root = "content"

    # Remove whatever a previous run left behind
    if os.path.exists(output_root):
        shutil.rmtree(output_root)

    # Several rubriques can map to the same category, hence the set; 'skip' never gets a directory
    exported = {name for name in CATEGORIES.values() if name != 'skip'}
    for name in exported:
        os.makedirs(os.path.join(output_root, name))
class Website:
"""Define a website from Spip data"""
def __init__(self, rubriques_filename, documents_filename, articles_filename):
self.rubriques_filename = rubriques_filename
self.documents_filename = documents_filename
self.articles_filename = articles_filename
def __init__(self, reset_output_dir=True, config_filename="config.yml"):
    """Build a website description from the settings stored in a YAML configuration file.

    reset_output_dir: when true (the default), erase and recreate the output directories once the
        configuration has been read.
    config_filename: path of the YAML configuration file. Defaults to "config.yml", which was the only
        location supported before this parameter existed.

    A missing configuration key raises KeyError. A category whose value is neither an integer nor a list
    is reported through logger.critical and ignored.
    """
    self.category_index = {}
    self.article_index = {}
    self.doc_index = {}

    # Read as UTF-8 explicitly so that non-ASCII values do not depend on the platform default encoding
    with open(config_filename, 'r', encoding='utf-8') as ymlfile:
        cfg = yaml.load(ymlfile)

    self.site_url = cfg['site_url']
    self.authors = cfg['authors']
    self.attachments_prefix = cfg['attachments_prefix']
    self.rubriques_filename = cfg['rubriques_filename']
    self.documents_filename = cfg['documents_filename']
    self.articles_filename = cfg['articles_filename']

    # The config maps {pelican_category: rubrique id or list of ids}; lookups need the inverse mapping
    # {rubrique id: pelican_category}. The -1 entry is always present, whatever the config contains.
    self.categories = {-1: "spip_divers"}
    for pelican_category, spip_rubrique in cfg['categories'].items():
        if isinstance(spip_rubrique, list):
            # this pelican category corresponds to a list of rubriques
            rubriques = spip_rubrique
        elif isinstance(spip_rubrique, int) and not isinstance(spip_rubrique, bool):
            # this pelican category corresponds to a single rubrique
            # (bool is a subclass of int: a YAML true/false is still rejected)
            rubriques = [spip_rubrique]
        else:
            logger.critical(f"Error in {config_filename}: {pelican_category}: {spip_rubrique}")
            continue
        for rubrique in rubriques:
            self.categories[rubrique] = pelican_category

    if reset_output_dir:
        self.reset_output_directories()
def reset_output_directories(self):
    """Start from a clean "content" tree holding one empty sub-directory per exported category"""
    output_root = "content"

    # Drop the output of any previous run
    if os.path.exists(output_root):
        shutil.rmtree(output_root)

    # Distinct category names only; the 'skip' category never gets a directory
    exported = {name for name in self.categories.values() if name != 'skip'}
    for name in exported:
        os.makedirs(os.path.join(output_root, name))
def _build_category_index(self):
"""Build the index dictionary: {id_rubrique: category_name}"""
def get_category(id_rubrique):
"""Return category from id_rubrique"""
while id_rubrique not in CATEGORIES:
while id_rubrique not in self.categories:
try:
id_rubrique = parents[id_rubrique]
except KeyError:
id_rubrique = -1
return CATEGORIES[id_rubrique]
return self.categories[id_rubrique]
# Load original rubriques file as a list
with open(self.rubriques_filename, mode='r') as yml_rubriques:
......@@ -444,7 +460,6 @@ class Website:
# Script entry point.
# NOTE(review): this diff view lists removed and added statements together, without +/- markers.
if __name__ == '__main__':
# Previous version: module-level reset, then a Website built from three explicit file names.
reset_output_directories()
website = Website("spip_rubriques_clean.yml", "spip_documents.yml", "spip_articles_clean.yml")
# Current version: Website() takes its settings from config.yml and resets the output directories itself.
website = Website()
# presumably loads the Spip data, then writes the Pelican content; neither method body is visible here
website.read_spip()
website.export_to_pelican()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment