Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
groupe-calcul
spip2pelican
Commits
9818f697
Commit
9818f697
authored
Jun 02, 2018
by
Matthieu Boileau
Browse files
Read parameters from config.yml (Fix
#11
)
parent
60edf76e
Changes
3
Hide whitespace changes
Inline
Side-by-side
README.md
View file @
9818f697
...
...
@@ -33,6 +33,9 @@ Ce script produit le fichier `spip_articles_clean.yml`.
### 4. Convertir les articles spip en articles pelican
-
Editer le fichier
`config.yml`
-
Lancer la conversion:
```
python3 spip2pelican.py
```
...
...
config.yml
0 → 100644
View file @
9818f697
site_url
:
http://calcul.math.cnrs.fr
authors
:
Groupe Calcul
attachments_prefix
:
attachments/spip/
categories
:
journees
:
4
formations
:
39
paysage
:
-
6
-
10
-
14
groupe
:
2
skip
:
49
rubriques_filename
:
spip_rubriques_clean.yml
articles_filename
:
spip_articles_clean.yml
documents_filename
:
spip_documents.yml
spip2pelican.py
View file @
9818f697
...
...
@@ -16,12 +16,6 @@ import sys
from
yaml_cleaner
import
strip_invalid
,
remove_null_date
SITE_URL
=
"http://calcul.math.cnrs.fr"
AUTHORS
=
"Groupe Calcul"
CATEGORIES
=
{
4
:
"journees"
,
39
:
"formations"
,
6
:
"paysage"
,
10
:
"paysage"
,
14
:
"paysage"
,
2
:
"groupe"
,
49
:
"skip"
,
-
1
:
"spip_divers"
}
ATTACHMENTS_PREFIX
=
"attachments/spip/"
yaml
=
YAML
(
typ
=
'safe'
)
SKIP_REASON
=
{
"skip_rub"
:
"belonging to a skipped rubrique"
,
...
...
@@ -73,7 +67,7 @@ def bold(s):
return
re
.
sub
(
r
"(^|[^{]){{([^}]+)}}([^}]|$)"
,
r
"\1**\2**\3"
,
s
)
def
html_link
s
(
s
):
def
html_link
(
s
,
website
):
"""Replace html href by the right Pelican (relative) URL"""
def
link_replace
(
matchobj
):
...
...
@@ -90,25 +84,26 @@ def html_links(s):
def
link_replace_doc
(
matchobj
):
"""Prepend attachment document path with attachment prefix"""
return
ATTACHMENTS_PREFIX
+
matchobj
.
group
(
2
)
return
website
.
attachments_prefix
+
matchobj
.
group
(
2
)
soup
=
bs4
.
BeautifulSoup
(
s
,
"html.parser"
)
for
link
in
soup
.
find_all
(
'a'
):
link_url
=
link
.
get
(
'href'
)
if
link_url
:
new_url
=
re
.
sub
(
r
"\A{}/spip.php\?(article|rubrique)([0-9]+)(.*)"
.
format
(
SITE_URL
),
link_replace
,
link_url
)
new_url
=
re
.
sub
(
r
"\A({}/|)(Documents/.*)"
.
format
(
SITE_URL
),
link_replace_doc
,
new_url
)
new_url
=
re
.
sub
(
r
"\A{}/spip.php\?(article|rubrique)([0-9]+)(.*)"
.
format
(
website
.
site_url
),
link_replace
,
link_url
)
new_url
=
re
.
sub
(
r
"\A({}/|)(Documents/.*)"
.
format
(
website
.
site_url
),
link_replace_doc
,
new_url
)
link
[
'href'
]
=
new_url
return
soup
.
prettify
(
formatter
=
None
)
# formatter=None to avoid ">" -> "&gt;" conversion
def
html_img
(
s
):
def
html_img
(
s
,
website
):
"""Replace html img src by the right Pelican (relative) URL"""
def
src_replace
(
matchobj
):
"""Prepend attachment image path with attachment prefix"""
return
ATTACHMENTS_PREFIX
+
matchobj
.
group
(
0
)
return
website
.
attachments_prefix
+
matchobj
.
group
(
0
)
soup
=
bs4
.
BeautifulSoup
(
s
,
"html.parser"
)
for
img
in
soup
.
find_all
(
'img'
):
...
...
@@ -156,7 +151,7 @@ def link(s, website):
doc_url
=
re
.
match
(
r
"\Adoc([0-9]+)"
,
url
)
if
doc_url
:
# [text->doc#]
new_url
=
os
.
path
.
join
(
ATTACHMENTS_PREFIX
,
"IMG"
,
website
.
doc_index
[
int
(
doc_url
.
group
(
1
))])
new_url
=
os
.
path
.
join
(
website
.
attachments_prefix
,
"IMG"
,
website
.
doc_index
[
int
(
doc_url
.
group
(
1
))])
else
:
art_url
=
re
.
match
(
r
"\Aart([0-9]+)"
,
url
)
if
art_url
:
...
...
@@ -178,7 +173,7 @@ def link(s, website):
new_url
=
os
.
path
.
join
(
f
"
{
category
}
.html"
)
else
:
# [text->path_to_file]
new_url
=
os
.
path
.
join
(
ATTACHMENTS_PREFIX
,
url
)
new_url
=
os
.
path
.
join
(
website
.
attachments_prefix
,
url
)
new_link
=
f
"[
{
text
}
](
{
new_url
}
)"
return
new_link
...
...
@@ -197,7 +192,7 @@ def document(s, website):
"""A call back function to replace a Spip doc by a Pelican link"""
doc_type
=
matchobj
.
group
(
1
)
doc_id
=
int
(
matchobj
.
group
(
2
))
url
=
os
.
path
.
join
(
ATTACHMENTS_PREFIX
,
"IMG"
,
website
.
doc_index
[
doc_id
])
url
=
os
.
path
.
join
(
website
.
attachments_prefix
,
"IMG"
,
website
.
doc_index
[
doc_id
])
if
doc_type
==
'doc'
:
return
f
"[Document](
{
url
}
)"
else
:
...
...
@@ -252,8 +247,8 @@ def horizontal_rule(s):
def
spip_to_markdown
(
s
,
website
):
"""Convert string from Spip format to Pelican markdown format"""
s
=
html_link
s
(
s
)
s
=
html_img
(
s
)
s
=
html_link
(
s
,
website
)
s
=
html_img
(
s
,
website
)
s
=
italic
(
s
)
s
=
bold
(
s
)
s
=
ordered_list
(
s
)
...
...
@@ -324,7 +319,7 @@ class Article:
self
.
title
=
spip_to_markdown
(
self
.
title
,
self
.
website
).
strip
()
# strip to remove any CR at end of string
tags
=
[]
authors
=
AUTHORS
authors
=
self
.
website
.
authors
content
=
spip_to_markdown
(
self
.
text
,
self
.
website
)
header
=
f
"""
\
title:
{
self
.
title
}
...
...
@@ -347,38 +342,59 @@ summary: {self.summary}
return
self
.
skip_reason
def
reset_output_directories
():
"""Erase existing output files and create empty output directories"""
if
os
.
path
.
exists
(
"content"
):
shutil
.
rmtree
(
"content"
)
for
category
in
set
(
CATEGORIES
.
values
()):
if
category
!=
'skip'
:
os
.
makedirs
(
os
.
path
.
join
(
"content"
,
category
))
class
Website
:
"""Define a website from Spip data"""
def
__init__
(
self
,
rubriques_filename
,
documents_filename
,
articles_filename
):
self
.
rubriques_filename
=
rubriques_filename
self
.
documents_filename
=
documents_filename
self
.
articles_filename
=
articles_filename
def
__init__
(
self
,
reset_output_dir
=
True
):
self
.
category_index
=
{}
self
.
article_index
=
{}
self
.
doc_index
=
{}
config_filename
=
"config.yml"
with
open
(
config_filename
,
'r'
)
as
ymlfile
:
cfg
=
yaml
.
load
(
ymlfile
)
self
.
site_url
=
cfg
[
'site_url'
]
self
.
authors
=
cfg
[
'authors'
]
self
.
attachments_prefix
=
cfg
[
'attachments_prefix'
]
self
.
rubriques_filename
=
cfg
[
'rubriques_filename'
]
self
.
documents_filename
=
cfg
[
'documents_filename'
]
self
.
articles_filename
=
cfg
[
'articles_filename'
]
self
.
categories
=
{
-
1
:
"spip_divers"
}
for
pelican_category
,
spip_rubrique
in
cfg
[
'categories'
].
items
():
if
type
(
spip_rubrique
)
==
int
:
# this pelican category corresponds to a single rubrique
self
.
categories
[
spip_rubrique
]
=
pelican_category
elif
type
(
spip_rubrique
)
==
list
:
# this pelican category corresponds to a list of rubriques
for
rubrique
in
spip_rubrique
:
self
.
categories
[
rubrique
]
=
pelican_category
else
:
logger
.
critical
(
f
"Error in
{
config_filename
}
:
{
pelican_category
}
:
{
spip_rubrique
}
"
)
if
reset_output_dir
:
self
.
reset_output_directories
()
def
reset_output_directories
(
self
):
"""Erase existing output files and create empty output directories"""
if
os
.
path
.
exists
(
"content"
):
shutil
.
rmtree
(
"content"
)
for
category
in
set
(
self
.
categories
.
values
()):
if
category
!=
'skip'
:
os
.
makedirs
(
os
.
path
.
join
(
"content"
,
category
))
def
_build_category_index
(
self
):
"""Build the index dictionary: {id_rubrique: category_name}"""
def
get_category
(
id_rubrique
):
"""Return category from id_rubrique"""
while
id_rubrique
not
in
CATEGORIES
:
while
id_rubrique
not
in
self
.
categories
:
try
:
id_rubrique
=
parents
[
id_rubrique
]
except
KeyError
:
id_rubrique
=
-
1
return
CATEGORIES
[
id_rubrique
]
return
self
.
categories
[
id_rubrique
]
# Load original rubriques file as a list
with
open
(
self
.
rubriques_filename
,
mode
=
'r'
)
as
yml_rubriques
:
...
...
@@ -444,7 +460,6 @@ class Website:
if
__name__
==
'__main__'
:
reset_output_directories
()
website
=
Website
(
"spip_rubriques_clean.yml"
,
"spip_documents.yml"
,
"spip_articles_clean.yml"
)
website
=
Website
()
website
.
read_spip
()
website
.
export_to_pelican
()
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment