Commit a0147a2a authored by gouarin

add rst generator

parent 18c1b8d1
from ruamel.yaml import YAML
from ruamel.yaml.reader import Reader
import re
to_remove = [
def strip_invalid(s):
    """Return *s* without the characters matched by ``Reader.NON_PRINTABLE``.

    Each character is tested on its own against the ruamel.yaml reader's
    ``NON_PRINTABLE`` pattern; matching characters are dropped and every
    other character is kept in its original order.

    :param s: text to clean.
    :return: the filtered text (``''`` for empty input).
    """
    # Offending characters are removed outright rather than replaced by a
    # ``\xNN`` escape, so the result contains only characters from *s*.
    return ''.join(x for x in s if not Reader.NON_PRINTABLE.match(x))
def clean_titles(s):
    """Add simple quotes to titles to avoid interpretation of ":" as yaml syntax.

    Every line of *s* that starts with one of the ``titre``, ``nom_site`` or
    ``texte`` keys gets its value wrapped in single quotes. Values that start
    a block scalar (``|-``) are left untouched.

    :param s: raw YAML text.
    :return: the text with the matching values quoted.
    """
    def title_replace(matchobj):
        """Return a clean title line"""
        title_type = matchobj.group(1)
        title = matchobj.group(2)
        if title.startswith("|-"):
            # Do not replace if multiline content
            return matchobj.group(0)
        # Avoid real text simple quote to be interpreted as end of string
        title = title.replace("'", "''")
        return f"{title_type}'{title}'"

    # Same replacement for each key; the patterns only differ by key name.
    for pattern in ('^( titre: )(.*)$', '^( nom_site: )(.*)$', '^( texte: )(.*)$'):
        s = re.sub(pattern, title_replace, s, flags=re.MULTILINE)
    return s
# ruamel.yaml instance used further down for both loading and dumping.
yaml = YAML()
import codecs
# NOTE(review): not read or assigned anywhere in the visible code.
last_position = -1
def mixed_decoder(unicode_error):
    """Codec error handler: decode the offending bytes as ISO-8859-1.

    Meant to be registered with ``codecs.register_error`` and used as the
    ``errors`` argument of a *decode* call.

    :param unicode_error: the ``UnicodeDecodeError`` raised by the codec;
        its ``object`` is the bytes being decoded.
    :return: ``(replacement_text, resume_position)`` as required by the
        codec error-handler protocol.
    """
    start, end = unicode_error.start, unicode_error.end
    # ISO-8859-1 maps each byte to exactly one character, so decoding only
    # the failing slice gives the same text as decoding the whole buffer and
    # slicing afterwards, without re-decoding everything on every error.
    return unicode_error.object[start:end].decode("iso-8859-1"), end
def mixed_decoder_utf8(unicode_error):
    """Codec error handler: emit the unencodable characters as UTF-8 bytes.

    Meant to be registered with ``codecs.register_error`` and used as the
    ``errors`` argument of an *encode* call.

    :param unicode_error: the ``UnicodeEncodeError`` raised by the codec;
        its ``object`` is the text being encoded.
    :return: ``(replacement_bytes, resume_position)``; the bytes are copied
        to the output as-is by the encoder.
    """
    start, end = unicode_error.start, unicode_error.end
    return unicode_error.object[start:end].encode("utf-8"), end
# Register the handlers under the names given as the ``errors`` argument of
# the encode/decode calls further down.
# NOTE: "utf8" here is the name of an error handler, not of a codec.
codecs.register_error("mixed", mixed_decoder)
codecs.register_error("utf8", mixed_decoder_utf8)
# Read the whole export first so the input file is closed before the
# clean-up and parsing steps run.
# NOTE(review): no explicit encoding is given, so the platform default is
# used; the re-encoding step below depends on it -- confirm before changing.
with open("./spip_yml/spip_articles.yml", 'r') as stream:
    lines = stream.read()


def remove(match):
    """Replacement callback for ``re.sub``: delete whatever was matched."""
    return ''


# Each entry of ``to_remove`` is used as a regex (not escaped); the match
# runs from that entry to the end of its line and is deleted.
for r in to_remove:
    regex = re.compile(f'({r}.*)')
    lines = regex.sub(remove, lines)

# Re-encode as ISO-8859-1 (characters that do not fit are emitted as UTF-8
# bytes by the "utf8" handler), then decode as UTF-8 (bytes that are not
# valid UTF-8 are decoded as ISO-8859-1 by the "mixed" handler).
lines = lines.encode('iso-8859-1', 'utf8').decode('utf-8', 'mixed')
y = yaml.load(clean_titles(strip_invalid(lines)))

with open("./spip_yml/spip_articles_clean.yml", 'w') as stream:
    yaml.dump(y, stream)
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment