Commit c2aa06f9 authored by Matthieu Boileau
Browse files

Fix #14

parent ec93e330
site_url: http://calcul.math.cnrs.fr
authors: Groupe Calcul
default_author: Webmaster
attachments_prefix: attachments/spip/
categories:
journees: 4
......@@ -13,3 +13,5 @@ categories:
rubriques: spip_rubriques_clean.yml
articles: spip_articles_clean.yml
documents: spip_documents.yml
authors: spip_auteurs_clean.yml
authors_links: spip_auteurs_liens.yml
......@@ -32,7 +32,7 @@ fh.setLevel(logging.INFO)
# create console handler with higher log level and colored output
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
LOGFORMAT = " %(log_color)s%(message)s%(reset)s"
LOGFORMAT = "%(log_color)s%(message)s%(reset)s"
color_formatter = ColoredFormatter(LOGFORMAT)
ch.setFormatter(color_formatter)
......@@ -41,8 +41,6 @@ logger.addHandler(fh)
logger.addHandler(ch)
# TODO: handle mix of italic and bold : { {{Marc Poinot}} (ONERA)}
def header(s):
"""
SPIP: {{{...}}}
......@@ -188,8 +186,8 @@ def link(s, website):
art_url = re.match(r"\Aart([0-9]+)", url)
if art_url:
# [text->art#]
id_art = int(art_url.group(1))
try:
id_art = int(art_url.group(1))
new_url = "{filename}/" + website.article_index[id_art]
except KeyError:
new_url = nullify_url("non existing article", id_art, text, url)
......@@ -334,6 +332,11 @@ class Article:
self.summary = spip_article['descriptif']
self.text = spip_article['texte']
try:
self.authors = self.website.author_index[self.type][self.id]
except KeyError:
self.authors = self.website.default_author
def export_to_pelican(self):
"""
Content of a markdown article should look like:
......@@ -357,8 +360,6 @@ class Article:
self.title = spip_to_markdown(self.title, self.website).strip() # strip to remove any CR at end of string
tags = []
authors = self.website.authors
content = spip_to_markdown(self.text, self.website)
header = f"""\
title: {self.title}
date: {self.date}
......@@ -366,10 +367,11 @@ modified: {self.modified}
category: {self.category}
tags: {tags}
slug: {self.mdprefix}
authors: {authors}
authors: {self.authors}
summary: {self.summary}
"""
content = spip_to_markdown(self.text, self.website)
markdown = header + content
export_path = os.path.join("content", self.mdpath)
......@@ -388,22 +390,33 @@ class Website:
def __missing__(self, key):
return 0
@staticmethod
def _load_and_clean_yaml(filename):
    """Open filename, run its content through the cleaning helpers and parse it as yaml.

    The file object is handed to strip_invalid(), the result goes through
    remove_null_date(), and the cleaned text is parsed with yaml.load().
    Returns whatever yaml.load() produces for the cleaned content.
    """
    with open(filename, mode='r') as yml_file:
        without_invalid = strip_invalid(yml_file)
        without_null_dates = remove_null_date(without_invalid)
        # NOTE(review): yaml.load() is called without an explicit Loader; recent
        # PyYAML versions warn about or reject this form - confirm the PyYAML
        # version in use before upgrading.
        return yaml.load(without_null_dates)
def __init__(self, reset_output_dir=True):
self.category_index = {}
self.article_index = {}
self.doc_index = {}
self.author_index = {}
self.nullified_urls = self.MissingKeyDict()
self.articles = []
config_filename = "config.yml"
with open(config_filename, 'r') as ymlfile:
cfg = yaml.load(ymlfile)
self.site_url = cfg['site_url']
self.authors = cfg['authors']
self.default_author = cfg['default_author']
self.attachments_prefix = cfg['attachments_prefix']
self.rubriques_filename = cfg['rubriques']
self.documents_filename = cfg['documents']
self.articles_filename = cfg['articles']
self.authors_filename = cfg['authors']
self.authors_links_filename = cfg['authors_links']
self.categories = {-1: "spip_divers"}
for pelican_category, spip_rubrique in cfg['categories'].items():
if type(spip_rubrique) == int:
......@@ -441,8 +454,8 @@ class Website:
return self.categories[id_rubrique]
# Load original rubriques file as a list
with open(self.rubriques_filename, mode='r') as yml_rubriques:
rubriques = yaml.load(yml_rubriques.read())
with open(self.rubriques_filename, mode='r') as yml_file:
rubriques = yaml.load(yml_file.read())
parents = {rubrique['id_rubrique']: rubrique['id_parent'] for rubrique in rubriques}
self.category_index = {rubrique['id_rubrique']: get_category(rubrique['id_rubrique'])
......@@ -452,10 +465,27 @@ class Website:
"""Build the index dictionary: {id_doc: file_path}"""
# Load original document file as a list
with open(self.documents_filename, mode='r') as yml_doc:
docs = yaml.load(remove_null_date(strip_invalid(yml_doc)))
docs = self._load_and_clean_yaml(self.documents_filename)
self.doc_index = {doc['id_document']: doc['fichier'] for doc in docs}
def _build_author_index(self):
"""Build the index dictionary: {spip_type: art_id: author_name}"""
# Load author file as a list
authors = self._load_and_clean_yaml(self.authors_filename)
author_name_index = {author['id_auteur']: author['nom'] for author in authors}
# Load article/author file as a list
authors_links = self._load_and_clean_yaml(self.authors_links_filename)
for authors_link in authors_links:
spip_type = authors_link['objet']
art_id = authors_link['id_objet']
author_id = authors_link['id_auteur']
if spip_type not in self.author_index.keys():
self.author_index[spip_type] = {}
self.author_index[spip_type][art_id] = author_name_index[author_id]
def _build_articles(self):
"""
Build:
......@@ -473,20 +503,23 @@ class Website:
if not article.skip_reason:
self.article_index[article.id] = article.mdpath
self.articles = []
self.article_index = {}
add_articles(self.articles_filename, 'article')
add_articles(self.rubriques_filename, 'rubrique')
def read_spip(self):
    """Load the Spip yaml files and fill the indices and the article list.

    The builders are run in this order: category index, document index,
    author index, then articles.
    """
    for message in ("-------", "Loading Spip data"):
        logger.debug(message)

    build_steps = (
        self._build_category_index,
        self._build_doc_index,
        self._build_author_index,
        self._build_articles,
    )
    for build_step in build_steps:
        build_step()
def export_to_pelican(self):
"""Loop on Spip articles to convert them into Pelican format"""
logger.debug("-------")
logger.debug("Exporting to Pelican")
processed = []
for article in self.articles:
skip_reason = article.export_to_pelican()
......
This diff is collapsed.
This diff is collapsed.
......@@ -27,7 +27,7 @@ def remove_null_date(s):
"""
Remove "date:", "date_tmp:", etc. if equals to 0000-00-00 00:00:00 (otherwise yaml.load() would fail)
"""
s = re.sub(r'^ date.*: 0000-00-00 00:00:00$', r'', s, flags=re.MULTILINE)
s = re.sub(r'(^ date|^ en_ligne).*: 0000-00-00 00:00:00$', r'', s, flags=re.MULTILINE)
return s
......@@ -37,12 +37,14 @@ def clean_titles(s):
"""
def title_replace(matchobj):
"""Return a clean title line"""
title = matchobj.group(1)
title_type = matchobj.group(1)
title = matchobj.group(2)
title = re.sub(r"'", "''", title) # Avoid real text simple quote to be interpreted as end of string
return f" titre: '{title}'"
return f"{title_type}'{title}'"
res = re.sub('^ titre: (.*)$', title_replace, s, flags=re.MULTILINE)
return res
s = re.sub('^( titre: )(.*)$', title_replace, s, flags=re.MULTILINE)
s = re.sub('^( nom_site: )(.*)$', title_replace, s, flags=re.MULTILINE)
return s
def force_encode(line, iline, codecs=('cp1252', 'utf8')):
......@@ -91,5 +93,6 @@ def clean_yaml(yml_filename):
if __name__ == '__main__':
    # Run clean_yaml on each Spip yaml file, in this order
    for spip_yml in ("spip_auteurs.yml", "spip_articles.yml", "spip_rubriques.yml"):
        clean_yaml(spip_yml)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment