Fixing encoding

parent 59dfcf39
......@@ -86,7 +86,12 @@ with open("./spip_yml/spip_articles.yml", 'r') as stream:
regex = re.compile(f'({r}.*)')
lines = regex.sub(remove, lines)
lines = lines.encode('iso-8859-1', 'utf8').decode('utf-8', 'mixed')
#lines = lines.encode('iso-8859-1', 'utf8').decode('utf-8', 'mixed')
import ftfy
lines = ftfy.fix_encoding(lines)
with open("tmp.yml", "w") as fout:
fout.write(lines)
y = yaml.load(clean_titles(strip_invalid(lines)))
......
......@@ -13,17 +13,43 @@ import os
import re
from ruamel.yaml import YAML
from ruamel.yaml.reader import Reader
import io
import shutil
import sys
import ftfy # Fixing broken encoding
import chardet
# File names handed to clean_yaml_files() by the script entry point.
# Defined once, as a tuple: the previous text assigned a list and then
# immediately rebound the name to a backslash-continued tuple.
SPIPFILES = (
    "spip_auteurs.yml",
    "spip_auteurs_liens.yml",
    "spip_articles.yml",
    "spip_breves.yml",
    "spip_documents.yml",
    "spip_rubriques.yml",
)
OUTPUTDIR = "spip_yml"

# Key names looked up by is_removable_title(); lines carrying one of these
# keys are skipped by clean_generator().
TITLETOREMOVE = frozenset((
    'surtitre', 'soustitre', 'chapo',
    'maj', 'export',
    'visites', 'referers', 'popularite',
    'accepter_forum',
    'date_modif', 'langue_choisie',
    'id_trad', 'id_version',
    'nom_site', 'url_site',
    'virtuel', 'date_redac',
))

# Module-level logger, set to emit everything from DEBUG upwards.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
......@@ -45,12 +71,57 @@ def strip_invalid(s):
res = ''
for x in s:
if Reader.NON_PRINTABLE.match(x):
#res += '\\x{:x}'.format(ord(x))
continue
res += x
return res
def escape_title(s):
    """
    Single-quote the value of a "titre", "nom_site" or "texte" line.

    Quoting keeps a ":" inside the value from being read as YAML syntax.
    Any single quote already present in the value is doubled.

    The line is returned unchanged when it does not look like a
    "key: value" line, when the key is not one of the three above, or
    when the value starts with "|-" (multiline content).
    """
    found = escape_title.pattern.match(s)
    if not found:
        return s

    key, value = found.groups()
    if key not in ("titre", "nom_site", "texte"):
        return s
    if value.startswith("|-"):
        return s

    quoted = value.replace("'", "''")
    return " {}: '{}'\n".format(key, quoted)


# Compiled once and stored on the function: group 1 is the key, group 2 the value.
escape_title.pattern = re.compile(r'^ ([^ :]+):\s+(.*?)$')
def fix_encoding(s):
    """
    Repair wrongly encoded utf-8 text by delegating to ftfy.fix_text().

    The result is sometimes not consistent: the encoding can be fixed in a
    subpart of a sentence without being fixed in the whole sentence.
    """
    # NOTE(review): these keyword names look like the ftfy 5.x fix_text()
    # signature -- confirm against the installed ftfy version.
    options = {
        'remove_terminal_escapes': True,
        'fix_encoding': True,
        'fix_entities': True,
        'uncurl_quotes': False,
        'fix_latin_ligatures': True,
        'fix_character_width': True,
        'fix_line_breaks': False,
        'fix_surrogates': True,
        'remove_control_chars': True,
        'remove_bom': True,
        'normalization': 'NFC',
    }
    return ftfy.fix_text(s, **options)
def is_invalid_date(s):
    """
    Tell whether a line holds a null date (0000-00-00 00:00:00).

    Returns the match object (truthy) when the line's key starts with
    "date" or "en_ligne" and its value is the null date, None otherwise.
    """
    null_date_re = is_invalid_date.pattern
    return null_date_re.match(s)


# Compiled once and stored on the function.
is_invalid_date.pattern = re.compile(
    r"^ (date|en_ligne)[^:]*: 0000-00-00 00:00:00$"
)
def remove_null_date(s):
"""
Remove "date:", "date_tmp:", etc. if equals to 0000-00-00 00:00:00 (otherwise yaml.load() would fail)
......@@ -58,71 +129,43 @@ def remove_null_date(s):
s = re.sub(r'(^ date|^ en_ligne).*: 0000-00-00 00:00:00$', r'', s, flags=re.MULTILINE)
return s
def clean_titles(s):
    """
    Add simple quotes to titles to avoid interpretation of ":" as yaml syntax.

    Works on a whole multi-line string: every line starting with
    " titre: ", " nom_site: " or " texte: " gets its value wrapped in
    single quotes, unless the value starts with "|-" (multiline content).
    Returns the rewritten string.
    """
    def title_replace(matchobj):
        """Return a clean title line."""
        title_type, title = matchobj.group(1), matchobj.group(2)
        if title.startswith("|-"):
            # Do not replace if multiline content
            return matchobj.group(0)
        # Avoid real text simple quote to be interpreted as end of string
        escaped = title.replace("'", "''")
        return f"{title_type}'{escaped}'"

    return clean_titles.pattern.sub(title_replace, s)


# One pass replaces the three identical per-key substitutions: a line can
# start with only one of the keys, so the result is the same.
clean_titles.pattern = re.compile(
    r'^( (?:titre|nom_site|texte): )(.*)$', flags=re.MULTILINE
)


def is_removable_title(s):
    """
    True if the title can be removed from the YAML content.

    Returns None when the line does not start with " key:", otherwise a
    bool telling whether that key is listed in TITLETOREMOVE.
    """
    match = is_removable_title.pattern.match(s)
    return match and match.group(1) in TITLETOREMOVE


# Compiled once and stored on the function: group 1 is the key name.
is_removable_title.pattern = re.compile(r'^ ([^ :]+):')
def clean_generator(iterable):
    """
    Generator of cleaned and fixed YAML lines.

    Lines for which is_invalid_date() or is_removable_title() is truthy
    are dropped. Every other line goes through fix_encoding(), has its
    U+2028 characters removed, and is then passed to escape_title().
    """
    #for line in deblock_generator(iterable):
    for line in iterable:
        if is_invalid_date(line) or is_removable_title(line):
            continue
        # Unicode "LINE SEPARATOR" (U+2028) generates error while parsing YAML
        yield escape_title(fix_encoding(line).replace("\u2028", ''))


def force_encode(line, iline, codecs=('cp1252', 'utf8')):
    """
    Re-encode a utf-8 bytes line through each codec until one round-trips.

    line   -- bytes, decoded as utf-8 first
    iline  -- line number, only used in the debug log message
    codecs -- codec names tried in order

    Returns the decoded text for the first codec that works, or None when
    every codec raises UnicodeDecodeError / UnicodeEncodeError.
    """
    for codec in codecs:
        try:
            return line.decode('utf-8').encode(codec).decode()
        except (UnicodeDecodeError, UnicodeEncodeError) as e:
            logger.debug(f"Line = {iline} Error = {e}")
    # Made explicit: callers must handle the "nothing worked" case.
    return None
def clean_yaml(yml_filename):
    """
    Read a given file and write a cleaned version with _clean suffix.

    The output path is the input path with its extension replaced by
    "_clean.yml". The input is streamed line by line through
    clean_generator(), so the file is never loaded whole.

    This span previously fused two bodies under one def: a
    load / dump / re-encode / reload pass and this streaming pass. Only
    the streaming pass is kept.
    """
    base_filename = os.path.splitext(yml_filename)[0]
    output_filename = base_filename + "_clean.yml"
    logger.info(f">>> Cleaning {yml_filename} -> {output_filename}")

    with open(yml_filename, mode='r') as yml_in:
        with open(output_filename, mode='w') as yml_out:
            yml_out.writelines(clean_generator(yml_in))
def reset_output_directory():
......@@ -172,3 +215,4 @@ if __name__ == '__main__':
reset_output_directory()
split_yaml(args.spipfile)
clean_yaml_files(SPIPFILES)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment