Commit 257c3bbe authored by Matthieu Boileau

Merge yaml_cleaner.py into process_yaml.py

Fix #19
parent d9627e48
......@@ -8,9 +8,10 @@ import argparse
import codecs
import itertools as it
import os
import re
from ruamel.yaml import YAML
from ruamel.yaml.reader import Reader
import shutil
from yaml_cleaner import clean_yaml
SPIPFILES = "spip_auteurs.yml", \
"spip_auteurs_liens.yml", \
......@@ -21,6 +22,93 @@ SPIPFILES = "spip_auteurs.yml", \
OUTPUTDIR = "spip_yml"
def strip_invalid(s):
    """
    Filter characters not allowed by YAML specifications

    Every character of ``s`` that matches ruamel's ``Reader.NON_PRINTABLE``
    pattern is dropped (it is not escaped); all other characters are kept in
    their original order.

    :param s: text to filter
    :return: a new string without the rejected characters
    """
    # A single join avoids rebuilding the string once per character, which
    # matters because callers pass the content of a whole file.
    return ''.join(x for x in s if not Reader.NON_PRINTABLE.match(x))
def remove_null_date(s):
    """
    Blank out null dates (otherwise yaml.load() would fail)

    Any line starting with " date" or " en_ligne" (e.g. "date:", "date_tmp:")
    whose value is 0000-00-00 00:00:00 is replaced by an empty line; the rest
    of the text is returned unchanged.
    """
    null_date = re.compile(r'(^ date|^ en_ligne).*: 0000-00-00 00:00:00$', re.MULTILINE)
    return null_date.sub(r'', s)
def clean_titles(s):
    """
    Wrap title-like values in simple quotes so that a ":" inside them is not
    interpreted as yaml syntax

    Only lines starting with " titre: ", " nom_site: " or " texte: " are
    rewritten. Values starting with "|-" are left untouched.
    """
    def quote_value(found):
        """Return the matched line with its value enclosed in simple quotes"""
        key, value = found.group(1), found.group(2)
        if value.startswith("|-"):
            # Multiline content: keep the line exactly as it is
            return found.group(0)
        # Double every simple quote so it is not read as the end of the string
        escaped = value.replace("'", "''")
        return f"{key}'{escaped}'"

    for pattern in ('^( titre: )(.*)$', '^( nom_site: )(.*)$', '^( texte: )(.*)$'):
        s = re.sub(pattern, quote_value, s, flags=re.MULTILINE)
    return s
def force_encode(line, iline, codecs=('cp1252', 'utf8')):
    """
    Decode a raw line, undoing a wrong intermediate encoding when possible

    For each codec in turn, the line is decoded as UTF-8, re-encoded with that
    codec and decoded as UTF-8 again. The result of the first round trip that
    succeeds is returned; each failed attempt is reported on stdout.

    :param line: raw line (bytes)
    :param iline: line number, only used in the reported messages
    :param codecs: codec names tried in order
    :return: the decoded line, always a str
    """
    for i in codecs:
        try:
            return line.decode('utf-8').encode(i).decode()
        except (UnicodeDecodeError, UnicodeEncodeError) as e:
            print(f"Line = {iline} Error = {e}")
    # No round trip succeeded (e.g. the line is not valid UTF-8, or no codec
    # was given). Falling off the loop used to return None, which broke callers
    # that concatenate the result; return a best-effort decoding instead.
    return line.decode('utf-8', errors='replace')
def clean_yaml(yml_filename):
    """
    Read a filename.yml and write filename_clean.yml

    The text is filtered with strip_invalid, clean_titles and remove_null_date,
    loaded, dumped to a temporary filename_tmp.yml, passed line by line through
    force_encode, loaded again and finally dumped to filename_clean.yml.

    :param yml_filename: path of the yaml file to clean
    """
    base_filename = os.path.splitext(yml_filename)[0]
    output_filename = base_filename + "_clean.yml"
    print(f">>> Cleaning {yml_filename} -> {output_filename}")

    yaml = YAML(typ='safe')

    # Load original yaml (presumably a list of records - confirm against data)
    with open(yml_filename, mode='r') as yml_file:
        s = remove_null_date(clean_titles(strip_invalid(yml_file.read())))
    yml_list = yaml.load(s)

    yml_fn_tmp = base_filename + "_tmp.yml"
    try:
        # Dump to file. force_encode decodes the raw bytes as UTF-8, so the
        # temporary file must be written as UTF-8 whatever the locale is.
        with open(yml_fn_tmp, mode='w', encoding='utf-8') as yml_file_tmp:
            yaml.dump(yml_list, yml_file_tmp)

        # Reopen file in binary mode to enforce encoding line by line
        with open(yml_fn_tmp, mode='rb') as yml_file_tmp:
            clean_lines = [force_encode(line, iline)
                           for iline, line in enumerate(yml_file_tmp, start=1)]
    finally:
        # Never leave the temporary file behind, even when a step fails
        if os.path.exists(yml_fn_tmp):
            os.remove(yml_fn_tmp)
    yml_clean = "".join(clean_lines)

    # Load the clean yaml content
    yml_list_new = yaml.load(yml_clean)

    # Dump clean yaml to file
    with open(output_filename, mode='w') as yml_file_clean:
        yaml.dump(yml_list_new, yml_file_clean)
def reset_output_directory():
"""Erase existing output files and create empty output directories"""
if os.path.exists(OUTPUTDIR):
......
......@@ -15,7 +15,7 @@ import os
import re
import shutil
import sys
from yaml_cleaner import strip_invalid, remove_null_date
from process_yaml import strip_invalid, remove_null_date
yaml = YAML(typ='safe')
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Load YAML file
"""
from ruamel.yaml import YAML
from ruamel.yaml.reader import Reader
import os
import re
def strip_invalid(s):
    """
    Filter characters not allowed by YAML specifications

    Every character of ``s`` that matches ruamel's ``Reader.NON_PRINTABLE``
    pattern is dropped (it is not escaped); all other characters are kept in
    their original order.

    :param s: text to filter
    :return: a new string without the rejected characters
    """
    # A single join avoids rebuilding the string once per character, which
    # matters because callers pass the content of a whole file.
    return ''.join(x for x in s if not Reader.NON_PRINTABLE.match(x))
def remove_null_date(s):
    """
    Blank out null dates (otherwise yaml.load() would fail)

    Any line starting with " date" or " en_ligne" (e.g. "date:", "date_tmp:")
    whose value is 0000-00-00 00:00:00 is replaced by an empty line; the rest
    of the text is returned unchanged.
    """
    null_date = re.compile(r'(^ date|^ en_ligne).*: 0000-00-00 00:00:00$', re.MULTILINE)
    return null_date.sub(r'', s)
def clean_titles(s):
    """
    Wrap title-like values in simple quotes so that a ":" inside them is not
    interpreted as yaml syntax

    Only lines starting with " titre: ", " nom_site: " or " texte: " are
    rewritten. Values starting with "|-" are left untouched.
    """
    def quote_value(found):
        """Return the matched line with its value enclosed in simple quotes"""
        key, value = found.group(1), found.group(2)
        if value.startswith("|-"):
            # Multiline content: keep the line exactly as it is
            return found.group(0)
        # Double every simple quote so it is not read as the end of the string
        escaped = value.replace("'", "''")
        return f"{key}'{escaped}'"

    for pattern in ('^( titre: )(.*)$', '^( nom_site: )(.*)$', '^( texte: )(.*)$'):
        s = re.sub(pattern, quote_value, s, flags=re.MULTILINE)
    return s
def force_encode(line, iline, codecs=('cp1252', 'utf8')):
    """
    Decode a raw line, undoing a wrong intermediate encoding when possible

    For each codec in turn, the line is decoded as UTF-8, re-encoded with that
    codec and decoded as UTF-8 again. The result of the first round trip that
    succeeds is returned; each failed attempt is reported on stdout.

    :param line: raw line (bytes)
    :param iline: line number, only used in the reported messages
    :param codecs: codec names tried in order
    :return: the decoded line, always a str
    """
    for i in codecs:
        try:
            return line.decode('utf-8').encode(i).decode()
        except (UnicodeDecodeError, UnicodeEncodeError) as e:
            print(f"Line = {iline} Error = {e}")
    # No round trip succeeded (e.g. the line is not valid UTF-8, or no codec
    # was given). Falling off the loop used to return None, which broke callers
    # that concatenate the result; return a best-effort decoding instead.
    return line.decode('utf-8', errors='replace')
def clean_yaml(yml_filename):
    """
    Read a filename.yml and write filename_clean.yml

    The text is filtered with strip_invalid, clean_titles and remove_null_date,
    loaded, dumped to a temporary filename_tmp.yml, passed line by line through
    force_encode, loaded again and finally dumped to filename_clean.yml.

    :param yml_filename: path of the yaml file to clean
    """
    base_filename = os.path.splitext(yml_filename)[0]
    output_filename = base_filename + "_clean.yml"
    print(f">>> Cleaning {yml_filename} -> {output_filename}")

    yaml = YAML(typ='safe')

    # Load original yaml (presumably a list of records - confirm against data)
    with open(yml_filename, mode='r') as yml_file:
        s = remove_null_date(clean_titles(strip_invalid(yml_file.read())))
    yml_list = yaml.load(s)

    yml_fn_tmp = base_filename + "_tmp.yml"
    try:
        # Dump to file. force_encode decodes the raw bytes as UTF-8, so the
        # temporary file must be written as UTF-8 whatever the locale is.
        with open(yml_fn_tmp, mode='w', encoding='utf-8') as yml_file_tmp:
            yaml.dump(yml_list, yml_file_tmp)

        # Reopen file in binary mode to enforce encoding line by line
        with open(yml_fn_tmp, mode='rb') as yml_file_tmp:
            clean_lines = [force_encode(line, iline)
                           for iline, line in enumerate(yml_file_tmp, start=1)]
    finally:
        # Never leave the temporary file behind, even when a step fails
        if os.path.exists(yml_fn_tmp):
            os.remove(yml_fn_tmp)
    yml_clean = "".join(clean_lines)

    # Load the clean yaml content
    yml_list_new = yaml.load(yml_clean)

    # Dump clean yaml to file
    with open(output_filename, mode='w') as yml_file_clean:
        yaml.dump(yml_list_new, yml_file_clean)
if __name__ == '__main__':
    # Clean each of the hard-coded yaml files in turn
    for yml_name in ("spip_auteurs.yml", "spip_articles.yml",
                     "spip_rubriques.yml", "spip_breves.yml"):
        clean_yaml(yml_name)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment