Commit 2214289d authored by Matthieu Boileau
Browse files

#208 Add a script to gather job offers data in a pandas dataframe

parent 3ff68fdb
Pipeline #5018 passed with stages
in 3 minutes and 7 seconds
......@@ -15,3 +15,4 @@ dependencies:
- urllib3==1.24.2
- requests
- pelican
- pandas
#!/usr/bin/env python3
"""Compute stat for job offers"""
from git import Repo
import os
from pathlib import PurePath
import html
import pandas as pd
import re
def parse_file_content(s):
    """Parse a markdown job offer and return its metadata as a dict.

    Each metadata field is looked up as a line of the form
    ``<Name>: <value>`` anywhere in ``s``.

    Args:
        s: Full content of the markdown file, as a string.

    Returns:
        A dict with the keys 'Type', 'Duration', 'Location', 'Employer',
        'Title' and 'Date' (in that order). A field that is not found in
        ``s`` maps to the empty string.
    """
    def get_metadata(data_name):
        """Return the HTML-unescaped value of ``data_name``, or '' if absent."""
        # Escape the field name so it is always matched literally.
        expression = r"^{}: (.*)$".format(re.escape(data_name))
        # MULTILINE makes ^/$ anchor on every line, not only on the whole text.
        m = re.search(expression, s, re.MULTILINE)
        if m:
            return html.unescape(m.group(1))
        return ''

    # (output key, metadata name in the markdown header)
    fields = (
        ('Type', 'Job_Type'),
        ('Duration', 'Job_Duration'),
        ('Location', 'Job_Location'),
        ('Employer', 'Job_Employer'),
        ('Title', 'Title'),
        ('Date', 'Date'),
    )
    return {key: get_metadata(data_name) for key, data_name in fields}
def process_git_repo():
    """Parse the git repository and return job data as a pandas DataFrame.

    The repository is opened at the grandparent directory of this script
    file. Every commit (across all refs) whose summary starts with
    'Adding new job offer' is inspected; if it touches a ``.md`` file, the
    content of that file at that commit is parsed with
    ``parse_file_content`` and contributes one row.

    Returns:
        A DataFrame indexed by 'datetime' (parsed from the 'Date' metadata)
        with the columns 'Type', 'Location', 'Duration', 'Employer' and
        'Title'.
    """
    this_script_path = PurePath(os.path.realpath(__file__))
    # parents[0] is the script's directory, parents[1] its parent directory.
    git_repo_path = this_script_path.parents[1]
    repo = Repo(git_repo_path)
    commits = repo.iter_commits('--all')
    jobs = [commit for commit in commits
            if commit.summary.startswith('Adding new job offer')]

    # Collect rows in a plain list and build the DataFrame once:
    # DataFrame.append copied the whole frame on every call and was removed
    # in pandas 2.0.
    rows = []
    for job in jobs:
        # Get markdown file path from commit data; if the commit touches
        # several .md files, the last one listed wins.
        job_file_path = ''
        for file_path in job.stats.files:
            if file_path.endswith('.md'):
                job_file_path = file_path
        if not job_file_path:
            continue
        # Get file content as a string: `git show <sha>:<path>` prints the
        # file as it was at that commit.
        file_content = repo.git.show(
            '{}:{}'.format(job.hexsha, job_file_path))
        rows.append(parse_file_content(file_content))

    df = pd.DataFrame(
        rows,
        columns=['Date', 'Type', 'Location', 'Duration', 'Employer', 'Title'])
    # Convert to time dataframe
    df['datetime'] = pd.to_datetime(df['Date'])
    df = df.set_index('datetime')
    df.drop(['Date'], axis=1, inplace=True)
    return df
# Script entry point: build the job offers dataframe when run directly.
# The result is bound to the module-level name `df` and not used further here.
if __name__ == '__main__':
    df = process_git_repo()
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment