Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
groupe-calcul
spip2pelican
Commits
a0147a2a
Commit
a0147a2a
authored
Jul 19, 2018
by
gouarin
Browse files
add rst generator
parent
18c1b8d1
Changes
2
Hide whitespace changes
Inline
Side-by-side
create_spip_article_clean.py
0 → 100644
View file @
a0147a2a
from
ruamel.yaml
import
YAML
from
ruamel.yaml.reader
import
Reader
import
re
# Key prefixes (each including its trailing colon) that the script's removal
# loop strips from the raw dump: the key and the rest of its line are deleted.
to_remove = ['surtitre:',
             'soustitre:',
             'chapo:',
             'maj:',
             'export:',
             'visites:',
             'referers:',
             'popularite:',
             'accepter_forum:',
             'date_modif:',
             'langue_choisie:',
             'id_trad:',
             'id_version:',
             'nom_site:',
             'url_site:',
             'virtuel:',
             'date_redac:']
def strip_invalid(s):
    """Return ``s`` without the characters matched by ``Reader.NON_PRINTABLE``.

    Each character is tested individually against the pattern exposed by the
    ruamel.yaml reader; matching characters are dropped, all others are kept
    in their original order.
    """
    non_printable = Reader.NON_PRINTABLE
    # join() builds the result in one pass instead of repeated concatenation.
    return ''.join(ch for ch in s if not non_printable.match(ch))
def clean_titles(s):
    """
    Add simple quotes to titles to avoid interpretation of ":" as yaml syntax

    The value of every ``titre:``, ``nom_site:`` and ``texte:`` line is wrapped
    in single quotes, unless it starts a block scalar (``|-``).
    """
    def title_replace(matchobj):
        """Return a clean title line"""
        title_type = matchobj.group(1)
        title = matchobj.group(2)
        if title.startswith("|-"):
            # Do not replace if multiline content
            return matchobj.group(0)
        # Avoid real text simple quote to be interpreted as end of string
        escaped = title.replace("'", "''")
        return f"{title_type}'{escaped}'"

    patterns = (
        '^( titre: )(.*)$',
        '^( nom_site: )(.*)$',
        '^( texte: )(.*)$',
    )
    for pattern in patterns:
        s = re.sub(pattern, title_replace, s, flags=re.MULTILINE)
    return s
# ruamel.yaml handle used further down both to load the cleaned text and to
# dump the result.
yaml = YAML()
import codecs
# Not read or assigned anywhere in the visible code.
last_position = -1
def mixed_decoder(unicode_error):
    """Decode-error handler: reinterpret the offending bytes as ISO-8859-1.

    Follows the ``codecs.register_error`` protocol: receives the
    ``UnicodeDecodeError`` and returns ``(replacement_text, resume_position)``.
    """
    start, end = unicode_error.start, unicode_error.end
    # ISO-8859-1 maps one byte to one character, so decoding only the bad
    # slice gives the same text as decoding the whole input and slicing it.
    return unicode_error.object[start:end].decode("iso-8859-1"), end
def mixed_decoder_utf8(unicode_error):
    """Encode-error handler: emit the unencodable characters as UTF-8 bytes.

    Follows the ``codecs.register_error`` protocol: receives the
    ``UnicodeEncodeError`` and returns ``(replacement_bytes, resume_position)``.
    """
    start, end = unicode_error.start, unicode_error.end
    return unicode_error.object[start:end].encode("utf-8"), end
# Expose both handlers by name to str.encode() / bytes.decode().
codecs.register_error("mixed", mixed_decoder)
codecs.register_error("utf8", mixed_decoder_utf8)

with open("./spip_yml/spip_articles.yml", 'r') as stream:
    lines = stream.read()


def remove(match):
    """Substitution callback: every match is replaced by the empty string."""
    return ''


# Delete each unwanted key together with the rest of its line.
for r in to_remove:
    lines = re.sub(f'({r}.*)', remove, lines)

# Round-trip through bytes: characters outside ISO-8859-1 are written as UTF-8
# by the "utf8" handler, then the bytes are read back as UTF-8 with the "mixed"
# handler falling back to ISO-8859-1 for invalid sequences.
raw_bytes = lines.encode('iso-8859-1', 'utf8')
lines = raw_bytes.decode('utf-8', 'mixed')

cleaned = clean_titles(strip_invalid(lines))
y = yaml.load(cleaned)

with open("./spip_yml/spip_articles_clean.yml", 'w') as stream:
    yaml.dump(y, stream)
spip2pelican.py
View file @
a0147a2a
...
...
@@ -9,6 +9,7 @@ import anytree
import
argparse
from
ruamel.yaml
import
YAML
import
bs4
from
bs4
import
BeautifulSoup
from
colorlog
import
ColoredFormatter
import
ftfy
import
logging
...
...
@@ -299,17 +300,329 @@ class SpipToMarkdown:
class
SpipToRst
(
SpipToMarkdown
):
"""A class to export spip article format to a ReStructuredText Pelican article"""
def
convert
(
self
,
s
,
preserve_line_breaks
=
False
):
"""Apply a pandoc conversion to markdown format"""
s
=
super
().
convert
(
s
)
if
preserve_line_breaks
:
extra_args
=
[
'--wrap=preserve'
]
else
:
extra_args
=
[
'--wrap=auto'
]
s
=
pypandoc
.
convert_text
(
s
,
'rst'
,
format
=
'md'
,
extra_args
=
extra_args
)
s
=
re
.
sub
(
r
"%7Bfilename%7D"
,
r
"{filename}"
,
s
)
# Correct unwanted pandoc translation
def __init__(self, website):
    """Keep a reference to ``website``, which the conversion methods read from."""
    # NOTE(review): the parent initializer is not called here — confirm that
    # SpipToMarkdown needs no setup of its own.
    self.website = website
def convert(self, s):
    """Convert string from Spip format to Pelican reStructuredText format.

    The text is threaded through each conversion pass below, in this order.
    ``html_link``, ``html_img`` and ``horizontal_rule`` are intentionally not
    part of the chain.
    """
    # NOTE(review): bold/italic run before header/header_extended, so the
    # "{{...}}" and "{...}" patterns look like they consume the braces of
    # "{{{...}}}" headers first — confirm this ordering is intended.
    pass_names = (
        'ordered_list',
        'unordered_list',
        'fix_li',
        'convert_html',
        'remove_font',
        'bold',
        'italic',
        'link',
        'remove_space',
        'remove_empty_link',
        'document',
        'fix_table',
        'remove_blank',
        'header',
        'header_extended',
    )
    for pass_name in pass_names:
        s = getattr(self, pass_name)(s)
    return s
def document(self, s):
    """
    Replace SPIP document and image shortcuts by reStructuredText markup.

    SPIP: <docN|...> or <imgN|...>
    rst:  `name <url>`__ for a document, an ``.. image::`` directive for an image

    The target path is looked up in ``self.website.doc_index`` by the numeric
    id and joined under ``self.website.attachments_prefix``/IMG. An id missing
    from the index raises whatever the index lookup raises.
    """
    def doc_rst(match):
        doc_type = match[1]
        doc_id = int(match[2])
        url = os.path.join(self.website.attachments_prefix, "IMG", self.website.doc_index[doc_id])
        if doc_type == 'doc':
            docname = os.path.basename(url)
            return f'`{docname} <{url}>`__'
        # reST explicit markup needs ".. " (with the space) to be a directive.
        return f'\n\n.. image:: {url}\n\n'

    # [^>]* keeps the match inside one tag even when the line has several.
    regex = re.compile(r'<(doc|img)([0-9]+)\|[^>]*>')
    return regex.sub(doc_rst, s)
def html_link(self, s):
    """Replace html href by the right Pelican (relative) URL

    Two rewrites are applied to the ``href`` of every ``<a>`` element:
    absolute ``spip.php?article|rubrique`` links of this site become relative
    ``spip_<type>-<id>.html`` names (or just the anchor when one is present),
    and ``Documents/...`` paths get the attachment prefix prepended.
    Returns the prettified document.
    """
    # The site URL is literal text: escape it so "." and friends are not
    # interpreted as regex metacharacters.
    site_url = re.escape(self.website.site_url)
    page_url = re.compile(r"\A" + site_url + r"/spip\.php\?(article|rubrique)([0-9]+)(.*)")
    doc_url = re.compile(r"\A(" + site_url + r"/|)(Documents/.*)")

    def link_replace(matchobj):
        """A call back function to replace a Spip absolute link by a relative link to Pelican file"""
        spip_type = matchobj.group(1)
        id_art = int(matchobj.group(2))
        anchorobj = re.match(r"#(.*)", matchobj.group(3))
        if anchorobj:
            return anchorobj.group(0)
        return f"spip_{spip_type}-{id_art}.html"

    def link_replace_doc(matchobj):
        """Prepend attachment document path with attachment prefix"""
        return self.website.attachments_prefix + matchobj.group(2)

    soup = bs4.BeautifulSoup(s, "html.parser")
    for link in soup.find_all('a'):
        link_url = link.get('href')
        if link_url:
            new_url = page_url.sub(link_replace, link_url)
            new_url = doc_url.sub(link_replace_doc, new_url)
            link['href'] = new_url
    return soup.prettify(formatter=None)  # formatter=None to avoid ">" -> "&gt;" conversion
def html_img(self, s):
    """Replace html img src by the right Pelican (relative) URL

    Every ``<img>`` whose ``src`` starts with ``Documents/`` gets the
    attachment prefix prepended; the prettified document is returned.
    """
    def with_prefix(found):
        """Prepend attachment image path with attachment prefix"""
        return self.website.attachments_prefix + found.group(0)

    tree = bs4.BeautifulSoup(s, "html.parser")
    for tag in tree.find_all('img'):
        source = tag.get('src')
        if not source:
            continue
        tag['src'] = re.sub(r"\ADocuments/.*", with_prefix, source)
    # formatter=None to avoid ">" -> "&gt;" conversion
    return tree.prettify(formatter=None)
def fix_table(self, s):
    """Strip SPIP table pipes from ``s``.

    A ``|^|`` or ``|<|`` cell is first replaced by ``| |``; then every
    remaining ``|`` is removed.
    """
    # Raw string: "\|" and "\^" are not valid escapes in a normal literal.
    s = re.sub(r'\|(\^|<)\|', '| |', s)
    return s.replace('|', '')
def fix_li(self, s):
    """Replace each ``<li>`` that starts with text by that text, without newlines.

    Only the first child of the ``<li>`` is looked at: when it is a string,
    the whole element is replaced by that string with ``\\n`` removed.
    Empty ``<li>`` elements are left untouched. Returns the prettified document.
    """
    soup = BeautifulSoup(s, 'html.parser')
    for li in soup.find_all('li'):
        if not li.contents:
            # An empty <li></li> has no first child to inspect.
            continue
        first = li.contents[0]
        if isinstance(first, str):
            li.replace_with(first.replace('\n', ''))
    return soup.prettify(formatter=None)
def remove_space(self, s):
    """Strip leading and trailing whitespace from every line of ``s``."""
    return '\n'.join(map(str.strip, s.split("\n")))
def remove_blank(self, s):
    """Left-strip ordinary lines; keep list lines as-is and add a newline after them.

    A line counts as a list line when, ignoring leading whitespace, it starts
    with ``-``.
    """
    def tidy(line):
        unindented = line.lstrip()
        if unindented.startswith('-'):
            return line + '\n'
        return unindented

    return '\n'.join(tidy(line) for line in s.split("\n"))
def convert_html(self, lines):
    """Convert embedded ``<ul>`` and then ``<a>`` elements to reST through pandoc.

    Each matching element is replaced in the tree by the text pandoc returns
    for it; the prettified document is returned.
    """
    tree = BeautifulSoup(lines, 'html.parser')
    for tag_name in ('ul', 'a'):
        for node in tree.find_all(tag_name):
            rst = pypandoc.convert_text(node, 'rst', format='html', extra_args=['--wrap=preserve'])
            node.replace_with(rst)
    return tree.prettify(formatter=None)
@
staticmethod
def remove_font(s):
    """Replace ``<font ...>``, ``</font>``, ``<html>``, ``</html>`` and ``<hr/>`` by a space.

    The opening font tag is matched up to its own closing ``>`` only, so text
    and other tags later on the same line are preserved.
    """
    regex = re.compile(r'<font [^>]*>|</font>|<html>|</html>|<hr/>')
    return regex.sub(' ', s)
@
staticmethod
def bold(s):
    """
    SPIP: {{ ... }}
    rst: **...** (followed by one space)
    """
    pattern = re.compile(r'({{2})([^}]+)(}{2})')

    def strong(found):
        return '**' + found[2].strip() + '** '

    # Line by line, so a match never spans a line break.
    return '\n'.join(pattern.sub(strong, line) for line in s.split("\n"))
@
staticmethod
def italic(s):
    """
    SPIP: {...}
    rst: *...* (followed by one space)
    """
    pattern = re.compile(r'({)([^}]*)(})')

    def emphasis(found):
        return '*' + found[2].strip() + '* '

    # Line by line, so a match never spans a line break.
    return '\n'.join(pattern.sub(emphasis, line) for line in s.split("\n"))
@
staticmethod
def ordered_list(s):
    """
    SPIP: - or -# in 1rst level, -## for second level, etc.
    rst: a list item on its own paragraph, indented 4 spaces per extra level

    The nesting depth is the number of ``#`` after the dash. Lines that do not
    start with a dash are returned unchanged.
    """
    # NOTE(review): items are emitted with a "-" bullet, not an enumerator
    # such as "#." — confirm this is what the later passes expect.
    def ordered_rst(match):
        # Depth comes from the '#' markers captured in group 1.
        indent = ' ' * 4 * (match[1].count('#') - 1)
        text = match[2].strip()
        return f'\n{indent}- {text}\n'

    regex = re.compile(r'^\s*-\s*(\#*)(.*)')
    return '\n'.join(regex.sub(ordered_rst, line) for line in s.split("\n"))
@
staticmethod
def remove_empty_link(s):
    """Replace a reST link with an empty target (```text <>`__``) by its text.

    The text part cannot contain a backtick, so a match always stays inside a
    single link even when several links share a line.
    """
    regex = re.compile(r'`([^`]*)<>`__')
    return regex.sub(r'\1', s)
@
staticmethod
def unordered_list(s):
    """
    SPIP: - or -* in 1rst level, -** for second level, etc.
    rst: - with 4-space indents, each item on its own paragraph

    The nesting depth is the number of ``*`` after the dash. Lines that do not
    start with a dash are returned unchanged.
    """
    def unordered_rst(match):
        indent = ' ' * 4 * (match[1].count('*') - 1)
        text = match[2].strip()
        return f'\n{indent}- {text}\n'

    # Raw string: "\s" and "\*" are not valid escapes in a normal literal.
    regex = re.compile(r'^\s*-\s*(\**)(.*)')
    return '\n'.join(regex.sub(unordered_rst, line) for line in s.split("\n"))
@
staticmethod
def header(s):
    """
    SPIP: {{{...}}}
    rst: the stripped title, underlined with "=" of the same length
    """
    def underline(found):
        title = found[2].strip()
        return f"{title}\n{'=' * len(title)}\n"

    return re.sub(r'({{3})([^}]*)(}{3})', underline, s)
@
staticmethod
def header_extended(s):
    """
    SPIP: {{{{{...}}}}}
    rst: the stripped title, underlined with "-" of the same length
    """
    def underline(found):
        title = found[2].strip()
        return f"{title}\n{'-' * len(title)}\n"

    return re.sub(r'({{5})([^}]*)(}{5})', underline, s)
@
staticmethod
def horizontal_rule(s):
    """
    SPIP: ---- with no carriage return before and after
    output: a "---" line with a newline before and after

    ``<hr>`` tags are converted the same way, before the dashes.
    """
    rule = "\n---\n"
    return s.replace("<hr>", rule).replace("----", rule)
def link(self, s):
    """
    SPIP: [text->url] or [text -> url]
    rst: `text <url>`__ , or the bare url if text is empty or equal to the url

    Targets are resolved in this order: ``mailto:`` (address only), ``http``
    URLs (kept), ``docN`` (looked up in ``self.website.doc_index``),
    ``art|rub|brevN`` (looked up in ``self.website.article_index``; an unknown
    id is logged and linked to an empty target), anything else (joined under
    ``self.website.attachments_prefix``).
    """
    def nullify_url(id_art, text, url):
        """Throw WARNING message and return empty URL"""
        msg = f" WARNING: nullify link to non existing article {id_art}\n"
        msg += f" text: {text}\n"
        msg += f" url: {url}"
        logger.warning(msg)
        self.website.nullified_urls += 1
        return ""

    def link_rst(match):
        # The text group keeps the blanks that precede "->": strip them so
        # "[ -> url]" counts as empty text and "[url -> url]" as identical.
        text = match[1].strip()
        link = match[2].strip()
        if text == '' or text == link:
            return link

        email = re.match('mailto:(.*)', link)
        if email:
            return email.group(1).strip()

        if link.startswith('http'):
            return f'`{text} <{link}>`__'

        doc_url = re.match(r'doc([0-9]+)', link)
        if doc_url:
            link = os.path.join(self.website.attachments_prefix, "IMG",
                                self.website.doc_index[int(doc_url.group(1))])
            return f'`{text} <{link}>`__'

        art_url = re.match(r"(art|rub|brev)([0-9]+)", link)
        if art_url:
            art_id = art_url.group(0)
            try:
                link = self.website.article_index[art_id]
            except KeyError:
                link = nullify_url(art_id, text, link)
            return f'`{text} <{link}>`__'

        link = os.path.join(self.website.attachments_prefix, link)
        return f'`{text} <{link}>`__'

    regex = re.compile(r'\[([^]]*)\s*-\s*>\s*([^]]*)\]')
    return regex.sub(link_rst, s)
class
Article
:
"""A generic class for a single Spip article or rubrique to be converted into a Pelican article file"""
...
...
@@ -319,6 +632,7 @@ class Article:
self
.
type
=
spip_type
self
.
website
=
website
id_tag
=
'id_'
+
self
.
type
print
(
spip_article
)
self
.
short_id
=
spip_article
[
id_tag
]
self
.
id
=
f
"
{
SHORTEN
[
self
.
type
]
}{
self
.
short_id
}
"
self
.
title
=
spip_article
[
'titre'
]
...
...
@@ -451,13 +765,14 @@ class ArticleRst(Article):
def convert_title(self, title):
    """Convert ``title`` with ``self.convert`` and strip surrounding whitespace."""
    # The diff leaves two return statements here; only the added one is kept.
    # The removed one passed preserve_line_breaks=True, which the
    # single-argument convert() no longer accepts.
    return self.convert(title).strip()
def
get_header
(
self
):
"""Return header in rst format"""
print
(
self
.
title
)
title
=
f
"
{
self
.
title
}
\n
{
'#'
*
len
(
self
.
title
)
}
\n\n
"
header
=
title
+
f
"""
\
#title = f"{self.title}\n{'#'*len(self.title)}\n\n"
header
=
f
"""
\
:title:
{
self
.
title
}
:date:
{
self
.
date
}
:modified:
{
self
.
modified
}
:category:
{
self
.
category
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment