Commit a0147a2a authored by gouarin's avatar gouarin
Browse files

add rst generator

parent 18c1b8d1
from ruamel.yaml import YAML
from ruamel.yaml.reader import Reader
import re
# Spip article fields that are irrelevant for the export and whose values
# often break YAML parsing; every line beginning with one of these keys is
# stripped from the dump before it is loaded.
to_remove = [
'surtitre:',
'soustitre:',
'chapo:',
'maj:',
'export:',
'visites:',
'referers:',
'popularite:',
'accepter_forum:',
'date_modif:',
'langue_choisie:',
'id_trad:',
'id_version:',
'nom_site:',
'url_site:',
'virtuel:',
'date_redac:'
]
def strip_invalid(s):
    """Drop characters that ruamel's Reader considers non-printable.

    ruamel.yaml refuses to parse streams containing characters matched by
    ``Reader.NON_PRINTABLE``; filtering them out up front avoids a
    ReaderError when the cleaned dump is loaded.

    :param s: raw text read from the spip YAML dump
    :return: *s* with every non-printable character removed
    """
    # ''.join over a generator is O(n); the previous char-by-char ``+=``
    # rebuilt the string on every appended character (quadratic).
    return ''.join(c for c in s if not Reader.NON_PRINTABLE.match(c))
def clean_titles(s):
    """
    Add simple quotes to titles to avoid interpretation of ":" as yaml syntax
    """
    def _quote_value(match):
        """Wrap a single field value in simple quotes."""
        field = match.group(1)
        value = match.group(2)
        if value.startswith("|-"):
            # Multiline scalars must stay untouched.
            return match.group(0)
        # Double any literal simple quote so it is not read as end of string.
        safe = value.replace("'", "''")
        return f"{field}'{safe}'"
    # The three title-like fields all share the same line shape.
    for field in ('titre', 'nom_site', 'texte'):
        s = re.sub(f'^( {field}: )(.*)$', _quote_value, s, flags=re.MULTILINE)
    return s
yaml = YAML()  # round-trip ruamel parser used for both load and dump below
import codecs
# Declared for the codecs error handlers below; never reassigned in practice.
last_position = -1
def mixed_decoder(unicode_error):
    """codecs error handler: decode undecodable UTF-8 bytes as ISO-8859-1.

    Registered under the name "mixed"; lets a file containing a mixture of
    the two encodings be decoded in a single pass.

    :param unicode_error: the UnicodeDecodeError raised by the utf-8 codec
    :return: (replacement string, position to resume decoding at)
    """
    raw = unicode_error.object
    start, end = unicode_error.start, unicode_error.end
    # Decode only the offending slice instead of the whole byte string on
    # every error; iso-8859-1 maps each byte to exactly one character, so
    # slicing by byte offsets is safe.
    return raw[start:end].decode("iso-8859-1"), end
def mixed_decoder_utf8(unicode_error):
    """codecs error handler: re-encode unencodable characters as UTF-8 bytes.

    Registered under the name "utf8"; used when encoding to iso-8859-1 a
    string that still contains characters outside that charset — the raw
    utf-8 bytes are passed through unchanged.

    :param unicode_error: the UnicodeEncodeError raised by the target codec
    :return: (replacement bytes, position to resume encoding at)
    """
    text = unicode_error.object
    start, end = unicode_error.start, unicode_error.end
    return text[start:end].encode("utf-8"), end
# Make the two lenient error handlers above available by name.
codecs.register_error("mixed", mixed_decoder)
codecs.register_error("utf8", mixed_decoder_utf8)

with open("./spip_yml/spip_articles.yml", 'r') as stream:
    lines = stream.read()

def remove(match):
    # Substitution callback: delete the matched field line entirely.
    return ''

# Drop every line carrying one of the unwanted spip fields.
for r in to_remove:
    regex = re.compile(f'({r}.*)')
    lines = regex.sub(remove, lines)

# Round-trip through bytes to repair mixed utf-8 / iso-8859-1 content:
# characters that cannot be re-encoded to iso-8859-1 are kept as utf-8
# bytes ("utf8" handler), and bytes that are not valid utf-8 are decoded
# as iso-8859-1 ("mixed" handler).
lines = lines.encode('iso-8859-1', 'utf8').decode('utf-8', 'mixed')

y = yaml.load(clean_titles(strip_invalid(lines)))

with open("./spip_yml/spip_articles_clean.yml", 'w') as stream:
    yaml.dump(y, stream)
......@@ -9,6 +9,7 @@ import anytree
import argparse
from ruamel.yaml import YAML
import bs4
from bs4 import BeautifulSoup
from colorlog import ColoredFormatter
import ftfy
import logging
......@@ -299,17 +300,329 @@ class SpipToMarkdown:
class SpipToRst(SpipToMarkdown):
"""A class to export spip article format to a ReStructuredText Pelican article"""
def convert(self, s, preserve_line_breaks=False):
    """Apply a pandoc conversion from markdown to ReStructuredText.

    :param s: the Spip-formatted source string
    :param preserve_line_breaks: keep original line breaks (used for titles)
    :return: the converted rst string
    """
    s = super().convert(s)
    # --wrap=preserve keeps pandoc from re-flowing lines.
    if preserve_line_breaks:
        extra_args = ['--wrap=preserve']
    else:
        extra_args = ['--wrap=auto']
    s = pypandoc.convert_text(s, 'rst', format='md', extra_args=extra_args)
    # Undo pandoc's percent-encoding of the Pelican {filename} macro.
    s = re.sub(r"%7Bfilename%7D", r"{filename}", s)
    # BUG FIX: the converted string was never returned.
    return s
def __init__(self, website):
    """Store the website model used to resolve links, documents and prefixes."""
    self.website = website
def convert(self, s):
    """Convert string from Spip format to Pelican markdown format."""
    # html_link / html_img / horizontal_rule are intentionally disabled.
    pipeline = (
        self.ordered_list,
        self.unordered_list,
        self.fix_li,
        self.convert_html,
        self.remove_font,
        self.bold,
        self.italic,
        self.link,
        self.remove_space,
        self.remove_empty_link,
        self.document,
        self.fix_table,
        self.remove_blank,
        self.header,
        self.header_extended,
    )
    # Order matters: each step consumes the previous step's output.
    for step in pipeline:
        s = step(s)
    return s
def document(self, s):
    """Replace Spip document/image tags by rst links or image directives.

    SPIP: <docNN|...> or <imgNN|...>
    rst:  `name <url>`__ for documents, an image directive for images.
    """
    def doc_rst(match):
        doc_type = match[1]
        doc_id = int(match[2])
        url = os.path.join(self.website.attachments_prefix, "IMG",
                           self.website.doc_index[doc_id])
        docname = os.path.basename(url)
        if doc_type == 'doc':
            return f'`{docname} <{url}>`__'
        # BUG FIX: the directive requires a space after the two dots
        # (".. image::"); "..image::" is not recognised by rst.
        return f'\n\n.. image:: {url}\n\n'
    # BUG FIX: "[^>]*" instead of greedy ".*" so two tags on one line are
    # not swallowed by a single match.  (Debug print removed.)
    regex = re.compile(r'<(doc|img)([0-9]+)\|[^>]*>')
    return regex.sub(doc_rst, s)
def html_link(self, s):
    """Replace html href by the right Pelican (relative) URL.

    Each <a href> pointing at a spip.php?articleNN / rubriqueNN URL of the
    site becomes a relative spip_<type>-<id>.html link (or a bare #anchor),
    and Documents/ paths get the attachments prefix prepended.
    NOTE(review): this call is commented out in convert() above.
    """
    def link_replace(matchobj):
        """A call back function to replace a Spip absolute link by a relative link to Pelican file"""
        spip_type = matchobj.group(1)
        id_art = int(matchobj.group(2))
        anchorobj = re.match(r"#(.*)", matchobj.group(3))
        if anchorobj:
            # Keep only the in-page anchor when the link targets one.
            new_url = anchorobj.group(0)
        else:
            new_url = f"spip_{spip_type}-{id_art}.html"
        return new_url
    def link_replace_doc(matchobj):
        """Prepend attachment document path with attachment prefix"""
        return self.website.attachments_prefix + matchobj.group(2)
    soup = bs4.BeautifulSoup(s, "html.parser")
    for link in soup.find_all('a'):
        link_url = link.get('href')
        if link_url:
            # First rewrite spip.php article/rubrique links, then prefix
            # any (possibly site-absolute) Documents/ attachment path.
            new_url = re.sub(r"\A{}/spip.php\?(article|rubrique)([0-9]+)(.*)".format(self.website.site_url),
                             link_replace,
                             link_url)
            new_url = re.sub(r"\A({}/|)(Documents/.*)".format(self.website.site_url), link_replace_doc, new_url)
            link['href'] = new_url
    return soup.prettify(formatter=None)  # formatter=None to avoid ">" -> "&gt;" conversion
def html_img(self, s):
    """Replace html img src by the right Pelican (relative) URL"""
    def _prefix_src(m):
        """Prepend attachment image path with attachment prefix"""
        return self.website.attachments_prefix + m.group(0)
    soup = bs4.BeautifulSoup(s, "html.parser")
    for image in soup.find_all('img'):
        src = image.get('src')
        if not src:
            continue
        image['src'] = re.sub(r"\ADocuments/.*", _prefix_src, src)
    return soup.prettify(formatter=None)  # formatter=None to avoid ">" -> "&gt;" conversion
def fix_table(self, s):
    """Neutralise Spip table cell markers ('|^|' and '|<|' become '| |'),
    then drop every remaining pipe character."""
    s = re.sub('\|(\^|<)\|', '| |', s)
    return s.replace('|', '')
def fix_li(self, s):
    """Strip newlines from plain-text <li> contents and unwrap the tag.

    Each <li> whose first child is a plain string is replaced by that
    string with embedded newlines removed.
    """
    soup = BeautifulSoup(s, 'html.parser')
    for li in soup.find_all('li'):
        # BUG FIX: guard against an empty <li></li>, whose contents list
        # is empty and would raise IndexError.
        if li.contents and isinstance(li.contents[0], str):
            text = li.contents[0].replace('\n', '')
            li.replace_with(text)
    return soup.prettify(formatter=None)
def remove_space(self, s):
    """Strip leading and trailing whitespace from every line of *s*."""
    return '\n'.join(line.strip() for line in s.split('\n'))
def remove_blank(self, s):
    """Left-strip every line, except list items ('-' prefix) which keep
    their indentation and get a blank line appended."""
    return '\n'.join(
        line + '\n' if line.lstrip().startswith('-') else line.lstrip()
        for line in s.split('\n')
    )
def convert_html(self, lines):
    """Run remaining <ul> and <a> html fragments through pandoc to rst."""
    soup = BeautifulSoup(lines, 'html.parser')
    # <ul> blocks are converted before <a> tags, as in the original order.
    for tag_name in ('ul', 'a'):
        for fragment in soup.find_all(tag_name):
            rst = pypandoc.convert_text(fragment, 'rst', format='html',
                                        extra_args=['--wrap=preserve'])
            fragment.replace_with(rst)
    return soup.prettify(formatter=None)
@staticmethod
def remove_font(s):
    """Replace <font ...>, </font>, <html>, </html> and <hr/> tags by a
    single space each."""
    patterns = (r'(<font .*>)', r'(</font>)', r'(<html>)', r'(</html>)', r'(<hr/>)')
    for pattern in patterns:
        s = re.sub(pattern, ' ', s)
    return s
@staticmethod
def bold(s):
    """
    SPIP: {{ ... }}
    md: **...**
    """
    pattern = re.compile(r'({{2})([^}]+)(}{2})')

    def to_bold(match):
        """Emit the trimmed text wrapped in rst strong markup."""
        return f'**{match[2].strip()}** '

    # Processed line by line, as in the rest of this converter.
    return '\n'.join(pattern.sub(to_bold, line) for line in s.split('\n'))
@staticmethod
def italic(s):
    """
    SPIP: {...}
    md: *...*
    """
    pattern = re.compile(r'({)([^}]*)(})')

    def to_italic(match):
        """Emit the trimmed text wrapped in rst emphasis markup."""
        return f'*{match[2].strip()}* '

    return '\n'.join(pattern.sub(to_italic, line) for line in s.split('\n'))
@staticmethod
def ordered_list(s):
    """Convert Spip ordered-list items to indented list items.

    SPIP: -# in 1st level, -## for second level, etc.
    out:  '-' items with a 4-space indent per extra level.
    """
    def ordered_rst(match):
        # BUG FIX: the captured group holds '#' characters, not '*';
        # counting '*' always returned 0 so the indent was always empty
        # (copy-paste from unordered_list).
        indent = ' ' * 4 * (match[1].count('#') - 1)
        return f'\n{indent}- {match[2]}\n'
    regex = re.compile(r'^\s*-\s*(\#*)(.*)')
    new = []
    for l in s.split("\n"):
        new.append(regex.sub(ordered_rst, l))
    return '\n'.join(new)
@staticmethod
def remove_empty_link(s):
    """Turn rst links with an empty target (`text <>`__) into plain text."""
    return re.sub(r'`(.*)<>`__', r'\1', s)
@staticmethod
def unordered_list(s):
    """
    SPIP: - or -* in 1rst level, -** for second level, etc.
    md: - with 4-space indents
    """
    pattern = re.compile('^\s*-\s*(\**)(.*)')

    def to_item(match):
        """Indent 4 spaces per extra '*' and trim the item text."""
        depth = match[1].count('*') - 1
        return f"\n{' ' * 4 * depth}- {match[2].strip()}\n"

    return '\n'.join(pattern.sub(to_item, line) for line in s.split('\n'))
@staticmethod
def header(s):
    """
    SPIP: {{{...}}}
    rst: title underlined with '='
    """
    def underline(match):
        title = match[2].strip()
        return f"{title}\n{'=' * len(title)}\n"
    return re.compile(r'({{3})([^}]*)(}{3})').sub(underline, s)
@staticmethod
def header_extended(s):
    """
    SPIP: {{{{{...}}}}}
    rst: title underlined with '-'
    """
    def underline(match):
        title = match[2].strip()
        return f"{title}\n{'-' * len(title)}\n"
    return re.compile(r'({{5})([^}]*)(}{5})').sub(underline, s)
@staticmethod
def horizontal_rule(s):
    """
    SPIP: ---- with no carriage return before and after
    md: \n---\n
    """
    # <hr> tags first, then the four-dash Spip rule.
    for rule in (r"<hr>", r"----"):
        s = re.sub(rule, r"\n---\n", s)
    return s
def link(self, s):
    """
    SPIP: [text->url] or [text -> url]
    md: [text](url) or <url> if text is empty
    """
    def nullify_url(id_art, text, url):
        """Throw WARNING message and return empty URL"""
        msg = f" WARNING: nullify link to non existing article {id_art}\n"
        msg += f" text: {text}\n"
        msg += f" url: {url}"
        logger.warning(msg)
        self.website.nullified_urls += 1
        return ""
    def link_rst(match):
        # Resolution order: bare/self links, mailto, absolute http URLs,
        # spip documents (docNN), spip articles/rubriques/breves, and
        # finally plain attachment paths.
        text = match[1]
        link = match[2].strip()
        if text == '' or text == link:
            # No (or redundant) link text: emit the bare URL.
            return f'{link}'
        email = re.match('mailto:(.*)', link)
        if email:
            return f'{email.group(1).strip()}'
        http_url = re.match(r'http', link)
        if http_url:
            return f'`{text} <{link}>`__'
        doc_url = re.match(r'doc([0-9]+)', link)
        if doc_url:
            # Attachment document: resolve through the site document index.
            link = os.path.join(self.website.attachments_prefix, "IMG",
                                self.website.doc_index[int(doc_url.group(1))])
            return f'`{text} <{link}>`__'
        art_url = re.match(r"(art|rub|brev)([0-9]+)", link)
        if art_url:
            art_id = art_url.group(0)
            try:
                link = self.website.article_index[art_id]
            except KeyError:
                # Article was deleted or never exported: warn and drop URL.
                link = nullify_url(art_id, text, link)
            return f'`{text} <{link}>`__'
        # Fallback: treat the target as a relative attachment path.
        link = os.path.join(self.website.attachments_prefix, link)
        return f'`{text} <{link}>`__'
    regex = re.compile(r'\[([^]]*)\s*-\s*>\s*([^]]*)\]')
    return regex.sub(link_rst, s)
class Article:
"""A generic class for a single Spip article or rubrique to be converted into a Pelican article file"""
......@@ -319,6 +632,7 @@ class Article:
self.type = spip_type
self.website = website
id_tag = 'id_' + self.type
print(spip_article)
self.short_id = spip_article[id_tag]
self.id = f"{SHORTEN[self.type]}{self.short_id}"
self.title = spip_article['titre']
......@@ -451,13 +765,14 @@ class ArticleRst(Article):
def convert_title(self, title):
    """Prevent line breaks when converting a title.

    :param title: raw Spip title string
    :return: rst-converted title with surrounding whitespace stripped
    """
    # BUG FIX: a stale duplicate "return self.convert(title).strip()"
    # followed this line and was unreachable dead code; removed.
    return self.convert(title, preserve_line_breaks=True).strip()
def get_header(self):
"""Return header in rst format"""
print(self.title)
title = f"{self.title}\n{'#'*len(self.title)}\n\n"
header = title + f"""\
#title = f"{self.title}\n{'#'*len(self.title)}\n\n"
header = f"""\
:title: {self.title}
:date: {self.date}
:modified: {self.modified}
:category: {self.category}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment