Commit 50981444 authored by Matthieu Boileau
Browse files

A first attempt to export rst pelican format

#25
parent 767ab3fd
......@@ -64,6 +64,7 @@ categories:
- 249 # inscrits à la 7ème journée mésocentres
- 151 # Validation d'inscription
- 160 # Liste des inscrits : Workshop "Masse de données : I/O, format de fichier, visualisation et archivage", 13 janvier 2011
- 138 # Confirmation d'inscription
rubriques:
# spip rubriques and spip articles that are not listed above will fall into categories below
evt_sci:
......
......@@ -10,8 +10,10 @@ import argparse
from ruamel.yaml import YAML
import bs4
from colorlog import ColoredFormatter
import ftfy
import logging
import os
import pypandoc
import re
import shutil
import sys
......@@ -44,63 +46,50 @@ logger.addHandler(fh)
logger.addHandler(ch)
def header(s):
    """
    Convert SPIP section titles to Markdown level-2 headings.

    SPIP: {{{...}}}
    md:   ## ...

    Only groups of exactly three braces are converted; deeper nestings such
    as ``{{{{{...}}}}}`` are left untouched. The brace-depth check uses
    lookbehind/lookahead assertions so the neighbouring characters are not
    consumed, which lets two titles separated by a single character both be
    converted in one pass.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with each title rewritten as ``\\n## title\\n``.
    """
    return re.sub(r"(?<!\{)\{{3}([^}]+)\}{3}(?!\})", r"\n## \1\n", s)
def header_extended(s):
    """
    Convert SPIP extended titles to Markdown level-3 headings.

    SPIP: {{{{{...}}}}}
    md:   ### ...

    Only groups of exactly five braces are converted. The brace-depth check
    uses lookbehind/lookahead assertions so the neighbouring characters are
    not consumed, which lets two titles separated by a single character both
    be converted in one pass.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with each extended title rewritten as ``\\n### title\\n``.
    """
    return re.sub(r"(?<!\{)\{{5}([^}]+)\}{5}(?!\})", r"\n### \1\n", s)
def italic(s):
    """
    Convert SPIP italic markup to Markdown emphasis.

    SPIP: {...}
    md:   *...*

    Only single-brace pairs are converted; ``{{...}}`` and deeper nestings
    are left untouched. The brace-depth check uses lookbehind/lookahead
    assertions so the neighbouring characters are not consumed, which lets
    consecutive spans such as ``{a} {b}`` both be converted in one pass.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with every single-brace span rewritten as ``*...*``.
    """
    return re.sub(r"(?<!\{)\{([^}]+)\}(?!\})", r"*\1*", s)
def italic_spaces(s):
    """
    Convert space-padded SPIP italic markup to Markdown emphasis.

    SPIP: { ... }   (one space after the opening and before the closing brace)
    md:   *...*

    The padding spaces are dropped. Lookbehind/lookahead assertions check
    the brace depth without consuming neighbouring characters, so
    consecutive spans separated by a single character are all converted.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with every padded single-brace span rewritten as ``*...*``.
    """
    return re.sub(r"(?<!\{)\{ ([^}]+) \}(?!\})", r"*\1*", s)
def italic_firstspace(s):
    """
    Convert SPIP italic markup with a leading space to Markdown emphasis.

    SPIP: { ...}   (one space after the opening brace only)
    md:   *...*

    The leading space is dropped. Lookbehind/lookahead assertions check the
    brace depth without consuming neighbouring characters, so consecutive
    spans separated by a single character are all converted.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with every such span rewritten as ``*...*``.
    """
    return re.sub(r"(?<!\{)\{ ([^}]+)\}(?!\})", r"*\1*", s)
class SpipToMarkdown:
def __init__(self, website):
"""A generic class to export spip format to a markup language"""
self.website = website
def bold(s):
    """
    Convert SPIP bold markup to Markdown strong emphasis.

    SPIP: {{...}}
    md:   **...**

    Only double-brace pairs are converted; ``{{{...}}}`` and deeper nestings
    are left untouched. Lookbehind/lookahead assertions check the brace
    depth without consuming neighbouring characters, so consecutive spans
    such as ``{{a}} {{b}}`` are all converted in a single pass.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with every double-brace span rewritten as ``**...**``.
    """
    return re.sub(r"(?<!\{)\{\{([^}]+)\}\}(?!\})", r"**\1**", s)
def convert(self, s):
    """Convert string from Spip format to Pelican markdown format.

    The text is threaded through each conversion step in a fixed order;
    every step takes the current text and returns the transformed text.
    ``bold`` appears twice in the sequence, exactly as in the original
    call chain.
    """
    pipeline = (
        self.document,
        self.html_link,
        self.html_img,
        self.bold_spaces,
        self.italic,
        self.bold,
        self.italic_spaces,
        self.italic_firstspace,
        self.bold,
        self.ordered_list,
        self.unordered_list,
        self.header,
        self.header_extended,
        self.horizontal_rule,
        self.link,
    )
    for step in pipeline:
        s = step(s)
    return s
def bold_spaces(s):
def document(self, s):
"""
SPIP: {{ ... }}
md: **...**
SPIP: <doc|path> or <img|path>
md: [text](url) or ![](img)
"""
return re.sub(r"(^|[^{]){{ ([^}]+) }}([^}]|$)", r"\1**\2**\3", s)
def doc_replace(matchobj):
"""A call back function to replace a Spip doc by a Pelican link"""
doc_type = matchobj.group(1)
doc_id = int(matchobj.group(2))
url = os.path.join(self.website.attachments_prefix, "IMG", self.website.doc_index[doc_id])
docname = os.path.basename(url)
if doc_type == 'doc':
return f"[{docname}]({url})"
else:
return f"![{docname}]({url})"
return re.sub(r'<(doc|img)([0-9]+)\|.*>', doc_replace, s)
def html_link(s, website):
def html_link(self, s):
"""Replace html href by the right Pelican (relative) URL"""
def link_replace(matchobj):
......@@ -117,26 +106,26 @@ def html_link(s, website):
def link_replace_doc(matchobj):
"""Prepend attachment document path with attachment prefix"""
return website.attachments_prefix + matchobj.group(2)
return self.website.attachments_prefix + matchobj.group(2)
soup = bs4.BeautifulSoup(s, "html.parser")
for link in soup.find_all('a'):
link_url = link.get('href')
if link_url:
new_url = re.sub(r"\A{}/spip.php\?(article|rubrique)([0-9]+)(.*)".format(website.site_url), link_replace,
new_url = re.sub(r"\A{}/spip.php\?(article|rubrique)([0-9]+)(.*)".format(self.website.site_url),
link_replace,
link_url)
new_url = re.sub(r"\A({}/|)(Documents/.*)".format(website.site_url), link_replace_doc, new_url)
new_url = re.sub(r"\A({}/|)(Documents/.*)".format(self.website.site_url), link_replace_doc, new_url)
link['href'] = new_url
return soup.prettify(formatter=None) # formatter=None to avoid ">" -> "&gt;" conversion
def html_img(s, website):
def html_img(self, s):
"""Replace html img src by the right Pelican (relative) URL"""
def src_replace(matchobj):
"""Prepend attachment image path with attachment prefix"""
return website.attachments_prefix + matchobj.group(0)
return self.website.attachments_prefix + matchobj.group(0)
soup = bs4.BeautifulSoup(s, "html.parser")
for img in soup.find_all('img'):
......@@ -147,13 +136,111 @@ def html_img(s, website):
return soup.prettify(formatter=None) # formatter=None to avoid ">" -> "&gt;" conversion
@staticmethod
def bold_spaces(s):
    """
    Convert space-padded SPIP bold markup to Markdown strong emphasis.

    SPIP: {{ ... }}   (one space inside each pair of braces)
    md:   **...**

    The padding spaces are dropped. Lookbehind/lookahead assertions check
    the brace depth without consuming neighbouring characters, so
    consecutive spans separated by a single character are all converted.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with every padded double-brace span rewritten as ``**...**``.
    """
    return re.sub(r"(?<!\{)\{\{ ([^}]+) \}\}(?!\})", r"**\1**", s)
@staticmethod
def italic(s):
    """
    Convert SPIP italic markup to Markdown emphasis.

    SPIP: {...}
    md:   *...*

    Only single-brace pairs are converted; ``{{...}}`` and deeper nestings
    are left untouched. The brace-depth check uses lookbehind/lookahead
    assertions so the neighbouring characters are not consumed, which lets
    consecutive spans such as ``{a} {b}`` both be converted in one pass.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with every single-brace span rewritten as ``*...*``.
    """
    return re.sub(r"(?<!\{)\{([^}]+)\}(?!\})", r"*\1*", s)
@staticmethod
def bold(s):
    """
    Convert SPIP bold markup to Markdown strong emphasis.

    SPIP: {{...}}
    md:   **...**

    Only double-brace pairs are converted; ``{{{...}}}`` and deeper nestings
    are left untouched. Lookbehind/lookahead assertions check the brace
    depth without consuming neighbouring characters, so consecutive spans
    such as ``{{a}} {{b}}`` are all converted in a single pass.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with every double-brace span rewritten as ``**...**``.
    """
    return re.sub(r"(?<!\{)\{\{([^}]+)\}\}(?!\})", r"**\1**", s)
@staticmethod
def italic_spaces(s):
    """
    Convert space-padded SPIP italic markup to Markdown emphasis.

    SPIP: { ... }   (one space after the opening and before the closing brace)
    md:   *...*

    The padding spaces are dropped. Lookbehind/lookahead assertions check
    the brace depth without consuming neighbouring characters, so
    consecutive spans separated by a single character are all converted.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with every padded single-brace span rewritten as ``*...*``.
    """
    return re.sub(r"(?<!\{)\{ ([^}]+) \}(?!\})", r"*\1*", s)
@staticmethod
def italic_firstspace(s):
    """
    Convert SPIP italic markup with a leading space to Markdown emphasis.

    SPIP: { ...}   (one space after the opening brace only)
    md:   *...*

    The leading space is dropped. Lookbehind/lookahead assertions check the
    brace depth without consuming neighbouring characters, so consecutive
    spans separated by a single character are all converted.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with every such span rewritten as ``*...*``.
    """
    return re.sub(r"(?<!\{)\{ ([^}]+)\}(?!\})", r"*\1*", s)
@staticmethod
def ordered_list(s):
    """
    Convert SPIP ordered lists to Markdown ordered lists.

    SPIP: -# for the first level, -## for the second level, etc. (up to 4)
    md:   1. with 4-space indents per nesting level

    A blank line is inserted before each first-level item so the list is
    separated from a preceding non-list paragraph; the blank lines this
    creates *between* consecutive first-level items are then removed.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with SPIP ordered-list markers replaced by ``1. `` items.
    """
    # Add a carriage return to separate from a possible non listed element
    s = re.sub(r"(^|\n)-# ", r"\n\g<1>1. ", s)
    # Remove the excessive carriage return between two consecutive items.
    # The lookahead leaves the next item unconsumed, so one pass handles
    # lists of any length; the dot of "1." is escaped to match literally.
    s = re.sub(r"(\n1\. [^\n]*)\n(?=\n1\. )", r"\1", s)
    # Nested levels: 4 more spaces of indentation per extra '#'
    for depth in (2, 3, 4):
        indent = " " * (4 * (depth - 1))
        s = re.sub(r"(^|\n)-#{%d} " % depth, r"\g<1>" + indent + "1. ", s)
    return s
@staticmethod
def unordered_list(s):
    """
    Convert SPIP unordered lists to Markdown unordered lists.

    SPIP: - or -* for the first level, -** for the second level, etc. (up to 4)
    md:   - with 4-space indents per nesting level

    A blank line is inserted before each first-level item so the list is
    separated from a preceding non-list paragraph; the blank lines this
    creates *between* consecutive first-level items are then removed.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with SPIP bullet markers replaced by ``- `` items.
    """
    # Add a carriage return to separate from a possible non listed element
    s = re.sub(r"(^|\n)-\*? ", r"\n\1- ", s)
    # Remove the excessive carriage return between two consecutive items.
    # The lookahead leaves the next item unconsumed, so one pass handles
    # lists of any length.
    s = re.sub(r"(\n- [^\n]*)\n(?=\n- )", r"\1", s)
    # Nested levels: 4 more spaces of indentation per extra '*'
    for depth in (2, 3, 4):
        indent = " " * (4 * (depth - 1))
        s = re.sub(r"(^|\n)-\*{%d} " % depth, r"\g<1>" + indent + "- ", s)
    return s
@staticmethod
def header(s):
    """
    Convert SPIP section titles to Markdown level-2 headings.

    SPIP: {{{...}}}
    md:   ## ...

    Only groups of exactly three braces are converted; deeper nestings such
    as ``{{{{{...}}}}}`` are left untouched. The brace-depth check uses
    lookbehind/lookahead assertions so the neighbouring characters are not
    consumed, which lets two titles separated by a single character both be
    converted in one pass.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with each title rewritten as ``\\n## title\\n``.
    """
    return re.sub(r"(?<!\{)\{{3}([^}]+)\}{3}(?!\})", r"\n## \1\n", s)
@staticmethod
def header_extended(s):
    """
    Convert SPIP extended titles to Markdown level-3 headings.

    SPIP: {{{{{...}}}}}
    md:   ### ...

    Only groups of exactly five braces are converted. The brace-depth check
    uses lookbehind/lookahead assertions so the neighbouring characters are
    not consumed, which lets two titles separated by a single character both
    be converted in one pass.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with each extended title rewritten as ``\\n### title\\n``.
    """
    return re.sub(r"(?<!\{)\{{5}([^}]+)\}{5}(?!\})", r"\n### \1\n", s)
@staticmethod
def horizontal_rule(s):
    """
    Convert SPIP and HTML horizontal rules to Markdown rules.

    SPIP: a run of four or more dashes, with no carriage return required
          before or after
    HTML: <hr>, <hr/> or <hr />
    md:   \\n---\\n

    A whole run of dashes becomes a single rule, so ``------`` does not
    leave stray dashes behind. The self-closing ``<hr/>`` spelling is
    accepted as well as ``<hr>``.

    Args:
        s: text containing SPIP markup and/or HTML.

    Returns:
        The text with each rule replaced by ``---`` on its own line.
    """
    s = re.sub(r"<hr\s*/?>", "\n---\n", s)
    # The "---" inserted above is only three dashes, so it is not re-matched
    return re.sub(r"-{4,}", "\n---\n", s)
def link(s, website):
def link(self, s):
"""
SPIP: [text->url] or [text -> url]
md: [text](url) or <url> if text is empty
"""
def link_replace(matchobj):
"""A call back function to replace a Spip link by a Pelican link"""
......@@ -166,7 +253,7 @@ def link(s, website):
msg += f" text: {text}\n"
msg += f" url: {url}"
logger.warning(msg)
website.nullified_urls += 1
self.website.nullified_urls += 1
return ""
# Remove "mailto:" prefix from URL
......@@ -185,106 +272,38 @@ def link(s, website):
art_url = re.match(r"\A(art|rub|brev)([0-9]+)", url)
if doc_url:
# [text->doc#]
new_url = os.path.join(website.attachments_prefix, "IMG", website.doc_index[int(doc_url.group(1))])
new_url = os.path.join(self.website.attachments_prefix, "IMG",
self.website.doc_index[int(doc_url.group(1))])
elif art_url:
# [text->art#,rub#,brev#]
art_id = art_url.group(0)
try:
new_url = "{filename}/" + website.article_index[art_id]
new_url = "{filename}/" + self.website.article_index[art_id]
except KeyError:
new_url = nullify_url(art_id, text, url)
else:
# [text->path_to_file]
new_url = os.path.join(website.attachments_prefix, url)
new_url = os.path.join(self.website.attachments_prefix, url)
if new_url:
new_link = f"[{text}]({new_url})"
else:
new_link = f"{text}" # URL is nullified
return new_link
s = re.sub(r"\[([^\]]*)->([^\]]+)\]", link_replace, s)
return s
return re.sub(r"\[([^\]]*)->([^\]]+)\]", link_replace, s)
def document(s, website):
"""
SPIP: <doc|path> or <img|path>
md: [text](url) or ![](img)
"""
class SpipToRst(SpipToMarkdown):
def doc_replace(matchobj):
"""A call back function to replace a Spip doc by a Pelican link"""
doc_type = matchobj.group(1)
doc_id = int(matchobj.group(2))
url = os.path.join(website.attachments_prefix, "IMG", website.doc_index[doc_id])
docname = os.path.basename(url)
if doc_type == 'doc':
return f"[{docname}]({url})"
def convert(self, s, preserve_line_breaks=False):
s = super().convert(s)
if preserve_line_breaks:
extra_args = ['--wrap=preserve']
else:
return f"![{docname}]({url})"
s = re.sub(r'<(doc|img)([0-9]+)\|.*>', doc_replace, s)
return s
def unordered_list(s):
    """
    Convert SPIP unordered lists to Markdown unordered lists.

    SPIP: - or -* for the first level, -** for the second level, etc. (up to 4)
    md:   - with 4-space indents per nesting level

    A blank line is inserted before each first-level item so the list is
    separated from a preceding non-list paragraph; the blank lines this
    creates *between* consecutive first-level items are then removed.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with SPIP bullet markers replaced by ``- `` items.
    """
    # Add a carriage return to separate from a possible non listed element
    s = re.sub(r"(^|\n)-\*? ", r"\n\1- ", s)
    # Remove the excessive carriage return between two consecutive items.
    # The lookahead leaves the next item unconsumed, so one pass handles
    # lists of any length.
    s = re.sub(r"(\n- [^\n]*)\n(?=\n- )", r"\1", s)
    # Nested levels: 4 more spaces of indentation per extra '*'
    for depth in (2, 3, 4):
        indent = " " * (4 * (depth - 1))
        s = re.sub(r"(^|\n)-\*{%d} " % depth, r"\g<1>" + indent + "- ", s)
    return s
def ordered_list(s):
    """
    Convert SPIP ordered lists to Markdown ordered lists.

    SPIP: -# for the first level, -## for the second level, etc. (up to 4)
    md:   1. with 4-space indents per nesting level

    A blank line is inserted before each first-level item so the list is
    separated from a preceding non-list paragraph; the blank lines this
    creates *between* consecutive first-level items are then removed.

    Args:
        s: text containing SPIP markup.

    Returns:
        The text with SPIP ordered-list markers replaced by ``1. `` items.
    """
    # Add a carriage return to separate from a possible non listed element
    s = re.sub(r"(^|\n)-# ", r"\n\g<1>1. ", s)
    # Remove the excessive carriage return between two consecutive items.
    # The lookahead leaves the next item unconsumed, so one pass handles
    # lists of any length; the dot of "1." is escaped to match literally.
    s = re.sub(r"(\n1\. [^\n]*)\n(?=\n1\. )", r"\1", s)
    # Nested levels: 4 more spaces of indentation per extra '#'
    for depth in (2, 3, 4):
        indent = " " * (4 * (depth - 1))
        s = re.sub(r"(^|\n)-#{%d} " % depth, r"\g<1>" + indent + "1. ", s)
    return s
def horizontal_rule(s):
    """
    Convert SPIP and HTML horizontal rules to Markdown rules.

    SPIP: a run of four or more dashes, with no carriage return required
          before or after
    HTML: <hr>, <hr/> or <hr />
    md:   \\n---\\n

    A whole run of dashes becomes a single rule, so ``------`` does not
    leave stray dashes behind. The self-closing ``<hr/>`` spelling is
    accepted as well as ``<hr>``.

    Args:
        s: text containing SPIP markup and/or HTML.

    Returns:
        The text with each rule replaced by ``---`` on its own line.
    """
    s = re.sub(r"<hr\s*/?>", "\n---\n", s)
    # The "---" inserted above is only three dashes, so it is not re-matched
    return re.sub(r"-{4,}", "\n---\n", s)
def spip_to_markdown(s, website):
"""Convert string from Spip format to Pelican markdown format"""
s = document(s, website)
s = html_link(s, website)
s = html_img(s, website)
s = bold_spaces(s)
s = italic(s)
s = bold(s)
s = italic_spaces(s)
s = italic_firstspace(s)
s = bold(s)
s = ordered_list(s)
s = unordered_list(s)
s = header(s)
s = header_extended(s)
s = horizontal_rule(s)
s = link(s, website)
extra_args = ['--wrap=auto']
s = pypandoc.convert_text(s, 'rst', format='md', extra_args=extra_args)
s = re.sub(r"\%7Bfilename\%7D", r"{filename}", s) # Correct unwanted pandoc translation
return s
......@@ -328,8 +347,8 @@ class Article:
if not self.skip_reason:
self.mdprefix = f"spip_{self.type}-{self.short_id}"
self.mdpath = os.path.join(self.category, self.mdprefix + ".md")
self.prefix = f"spip_{self.type}-{self.short_id}"
self.path = os.path.join(self.category, f"{self.prefix}.{self.website.ml_type}")
try:
self.date = spip_article['date']
except KeyError:
......@@ -346,6 +365,54 @@ class Article:
self.tags = [self.type]
if self.website.ml_type == 'md':
# Instanciate a spip -> markdown translator
s2md = SpipToMarkdown(self.website)
self.convert = s2md.convert
def convert_title(title):
return self.convert(title).strip() # strip to remove any CR at end of string
self.convert_title = convert_title
self.get_header = self.get_header_markdown
elif self.website.ml_type == 'rst':
# Instanciate a spip -> rst translator
s2rst = SpipToRst(self.website)
self.convert = s2rst.convert
def convert_title(title):
return self.convert(title, preserve_line_breaks=True).strip()
self.convert_title = convert_title
self.get_header = self.get_header_rst
else:
exit(f'Unknown markup language type: {self.website.ml_type}!')
def get_header_markdown(self):
    """Build the metadata header placed at the top of a Markdown article.

    Returns:
        One ``key: value`` line per metadata field, each terminated by a
        newline, in the order title, date, modified, category, tags, slug,
        authors, summary. The slug is taken from ``self.prefix``.
    """
    fields = (
        ("title", self.title),
        ("date", self.date),
        ("modified", self.modified),
        ("category", self.category),
        ("tags", self.tags),
        ("slug", self.prefix),
        ("authors", self.authors),
        ("summary", self.summary),
    )
    return "".join(f"{key}: {value}\n" for key, value in fields)
def get_header_rst(self):
    """Build the title and metadata header of a reStructuredText article.

    The title is underlined with as many ``#`` characters as the title has
    characters, followed by a blank line and one ``:field: value`` line per
    metadata field. The slug is taken from ``self.prefix``.

    Unlike the first draft, this does not print the title to stdout: that
    was leftover debugging output emitted on every exported article.

    Returns:
        The header text, terminated by a newline.
    """
    title = f"{self.title}"
    lines = [
        title,
        "#" * len(self.title),
        "",
        f":date: {self.date}",
        f":modified: {self.modified}",
        f":category: {self.category}",
        f":tags: {self.tags}",
        f":slug: {self.prefix}",
        f":authors: {self.authors}",
        f":summary: {self.summary}",
    ]
    return "\n".join(lines) + "\n"
def export_to_pelican(self):
"""
Content of a markdown article should look like:
......@@ -366,23 +433,11 @@ class Article:
if self.skip_reason:
logger.warning(f" WARNING: skipping because {self.skip_reason}")
else:
self.title = spip_to_markdown(self.title, self.website).strip() # strip to remove any CR at end of string
header = f"""\
title: {self.title}
date: {self.date}
modified: {self.modified}
category: {self.category}
tags: {self.tags}
slug: {self.mdprefix}
authors: {self.authors}
summary: {self.summary}
"""
content = spip_to_markdown(self.text, self.website)
markdown = header + content
self.title = self.convert_title(self.title)
content = ftfy.fix_encoding(self.convert(self.text))
markdown = self.get_header() + content
export_path = os.path.join("content", self.mdpath)
export_path = os.path.join("content", self.path)
with open(export_path, 'w') as f:
f.write(markdown)
logger.debug(f" --> {export_path}")
......@@ -403,10 +458,11 @@ class Website:
with open(filename, mode='r') as yml_file:
return yaml.load(remove_null_date(strip_invalid(yml_file)))
def __init__(self, reset_output_dir=True, include_breves=False):
def __init__(self, reset_output_dir=True, include_breves=False, ml_type='md'):
self.reset_output_dir = reset_output_dir
self.include_breves = include_breves
self.ml_type = ml_type
self.rubrique_to_category = {}
self.article_index = {}
self.doc_index = {}
......@@ -578,7 +634,7 @@ class Website:
article = Article(spip_article, spip_type, self)
self.articles.append(article)
if not article.skip_reason:
self.article_index[article.id] = article.mdpath
self.article_index[article.id] = article.path
add_articles(self.articles_filename, 'article')
add_articles(self.rubriques_filename, 'rubrique')
......@@ -629,12 +685,14 @@ def parse_cl_args():
help="List Spip breves corresponding to given rubrique id")
parser.add_argument('-c', '--convert', action='store_true', default=True, help="Convert to Pelican")
parser.add_argument('-ib', '--include_breves', action='store_true', help="Include breve in conversion")
parser.add_argument('-ml', '--markup', metavar="language", default='md', type=str,
help="Set markup language (md or rst)")
return parser.parse_args()
if __name__ == '__main__':
args = parse_cl_args()
website = Website(include_breves=args.include_breves)
website = Website(include_breves=args.include_breves, ml_type=args.markup)
website.read_spip()
if args.rubriques:
# Show only Spip rubrique structure
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment