stats script

parent 6d77de4696
commit 759f30f628

atom_generate.py (127 lines changed)
@@ -1,111 +1,56 @@
 import os
 import re
 import json
 from datetime import datetime

-# Source folder path
 import argparse

 from utils import find_first_level1_title, find_year_and_slug_on_filename, find_extract_in_content_org
+from utils import get_blog_template_conf
 from website_config import configs_sites

 # Command-line argument configuration
-parser = argparse.ArgumentParser(description="Générer un nouvel article en mode orgmode.")
+parser = argparse.ArgumentParser(description="Générer un flux Atom des articles.")
 parser.add_argument("blog", help="Le nom du dossier de blog.")

 args = parser.parse_args()

-website_ndd = configs_sites[args.blog]['NDD']
-blog = 'sources/'+args.blog+'/lang_fr/'
+template_content = get_blog_template_conf(args.blog)
+website_ndd = template_content['NDD']

-# Regular expression to extract the date from the article content
-date_regex = re.compile(r"\b(\d{14})\b")
-date_regex_org = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")
+# Load the data from the articles_info.json file
+json_file = f'sources/{args.blog}/build/articles_info.json'
+with open(json_file, 'r', encoding='utf-8') as f:
+    articles_info = json.load(f)

-# List of the org-mode files found
-org_files = []
-
-limit_articles_feed = 1000
-count_articles = 0
-
-# Walk the source folder looking for org-mode files
-for root, dirs, files in os.walk(blog):
-    for file in files:
-        if file.endswith(".org"):
-            date_str, annee, slug = find_year_and_slug_on_filename(file)
-            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
-                content = f.read()
-                extract = find_extract_in_content_org(content)
-            count_articles += 1
-            match = date_regex_org.search(date_str)
-            if match:
-                date = datetime.strptime(match.group(1), "%Y-%m-%d")
-                org_files.append((date, os.path.join(root, file), annee, slug, extract))
-
-            if count_articles > limit_articles_feed:
-                break
-    if count_articles > limit_articles_feed:
-        break
-
-org_files.sort(reverse=True)
+# Sort the articles by descending date
+sorted_articles = sorted(articles_info.values(), key=lambda x: x['date'], reverse=True)

-# Atom feed generation
-atom_feed = {"title": "Flux Atom des articles de "+args.blog,
-             "link": f"{website_ndd}/feed",
-             "updated": org_files[0][0],
-             "entries": []}
-
-for date, file, annee, slug, extract in org_files:
-    # Parse the org-mode file to extract the title, the description and the publication date
-    with open(file, "r", encoding="utf-8") as f:
-        content = f.read()
-        title = find_first_level1_title(content)
-        description = title
-        # published = date_str
-    # Add the article to the Atom feed
-    atom_entry = {"title": title,
-                  "summary": extract,
-                  "link": f"{website_ndd}/{annee}/{slug}",
-                  "published": date
-                  }
-    atom_feed["entries"].append(atom_entry)
-
 # Save the Atom feed to an XML file
+# The Atom feed must contain:
+# - A unique ID for the feed and for each entry
+# - An author tag with name and email
+# - Dates in ISO 8601 format with a timezone
+# - A self link to the XML file
-with open(f"index_{args.blog}.xml", "w", encoding="utf-8") as f:
+with open(f"html-websites/{args.blog}/feed/index.xml", "w", encoding="utf-8") as f:
     f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
     f.write('<feed xmlns="http://www.w3.org/2005/Atom">\n')
-    f.write(f'  <title>{atom_feed["title"]}</title>\n')
-    f.write(f'  <link href="{atom_feed["link"]}"/>\n')
-    f.write(f'  <updated>{atom_feed["updated"]}</updated>\n')
+    f.write(f'  <title>Flux Atom des articles de {args.blog}</title>\n')
+    f.write(f'  <link href="{website_ndd}/feed"/>\n')
+    f.write(f'  <updated>{datetime.fromisoformat(sorted_articles[0]["date"]).strftime("%Y-%m-%dT%H:%M:%S+00:00")}</updated>\n')
+    f.write('  <id>tag:' + website_ndd + ',2023:/feed</id>\n')
     f.write('  <author>\n')
-    f.write('    <name>Auteur du blog</name>\n')
-    f.write('    <email>auteur@example.com</email>\n')
+    f.write(f'    <name>{configs_sites[args.blog]["AUTHOR"]}</name>\n')
+    f.write(f'    <email>{configs_sites[args.blog]["EMAIL"]}</email>\n')
     f.write('  </author>\n')
+    f.write(f'  <link rel="self" href="{website_ndd}/feed"/>\n')

-    for entry in atom_feed["entries"]:
-        slug_id = entry["title"].lower().replace(" ", "-").replace("'", "-").replace("--", "-")
-        with open(file, "r", encoding="utf-8") as article_file:
-            article_content = article_file.read()
-
-        f.write('  <entry>\n')
-        f.write(f'    <id>tag:{website_ndd},2023:{entry["link"]}</id>\n')
-        f.write(f'    <title>{entry["title"]}</title>\n')
-        f.write(f'    <link href="{entry["link"]}"/>\n')
-        f.write('    <content type="html"><![CDATA[\n')
-        f.write(f'      {article_content}\n')
-        f.write('    ]]></content>\n')
-        f.write(f'    <summary>{entry["summary"]}</summary>\n')
-        f.write(f'    <published>{entry["published"].strftime("%Y-%m-%dT%H:%M:%S+00:00")}</published>\n')
-        f.write(f'    <updated>{entry["published"].strftime("%Y-%m-%dT%H:%M:%S+00:00")}</updated>\n')
-        f.write('    <author>\n')
-        f.write(f"      <name>{configs_sites[args.blog]['AUTHOR']}</name>\n")
-        f.write(f"      <email>{configs_sites[args.blog]['EMAIL']}</email>\n")
-        f.write('    </author>\n')
-        f.write('  </entry>\n')
-    f.write('</feed>')
+    # Loop over the articles
+    for article in sorted_articles:
+        f.write('  <entry>\n')
+        f.write(f'    <id>tag:{website_ndd},2023:{article["slug"]}</id>\n')
+        f.write(f'    <title>{article["title"]}</title>\n')
+        f.write(f'    <link href="{website_ndd}/{article["slug"]}"/>\n')
+        f.write('    <content type="html"><![CDATA[\n')
+        f.write(f'      {article["html_content"]}\n')
+        f.write('    ]]></content>\n')
+        f.write(f'    <summary>{article.get("extract", "")}</summary>\n')
+        f.write(f'    <published>{datetime.fromisoformat(article["date"]).strftime("%Y-%m-%dT%H:%M:%S+00:00")}</published>\n')
+        f.write(f'    <updated>{datetime.fromisoformat(article["date"]).strftime("%Y-%m-%dT%H:%M:%S+00:00")}</updated>\n')
+        f.write('    <author>\n')
+        f.write(f'      <name>{configs_sites[args.blog]["AUTHOR"]}</name>\n')
+        f.write(f'      <email>{configs_sites[args.blog]["EMAIL"]}</email>\n')
+        f.write('    </author>\n')
+        f.write('  </entry>\n')
+
+    f.write('</feed>')
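The requirement comments added above (unique IDs, an author tag, ISO 8601 dates, a self link) match what the Atom spec (RFC 4287) and the tag: URI scheme (RFC 4151) call for. A minimal sketch of how such IDs and dates can be produced; the domain, year and slug here are placeholder values, not taken from the commit:

    from datetime import datetime, timezone

    def make_entry_id(domain: str, year: int, slug: str) -> str:
        # tag: URI (RFC 4151): an authority, a date, then a site-specific path
        return f"tag:{domain},{year}:/{slug}"

    def atom_date(dt: datetime) -> str:
        # Atom requires RFC 3339 dates with an explicit UTC offset
        return dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")

    print(make_entry_id("example.org", 2023, "2024/mon-article"))  # tag:example.org,2023:/2024/mon-article
    print(atom_date(datetime(2024, 5, 1, 12, 0, tzinfo=timezone.utc)))  # 2024-05-01T12:00:00+00:00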
@@ -87,6 +87,7 @@ for website_name in "${blogs_folders[@]}"; do

 # convert the static pages
 python3 linking_articles_prev_next.py $website_name

 # build the tag pages from the tag info found in the org files
+python3 gather_tags_in_json.py $website_name

@@ -121,18 +121,47 @@ if generate_linkings_json :

         gemini_content = ''
         html_content = ''
+        # Check whether the HTML file exists, to determine last_html_build
+        html_path = f"html_websites/{args.blog}/{annee}/{slug}/index.html"
+        last_html_build = None
+        if os.path.exists(html_path):
+            last_html_build = time.ctime(os.path.getmtime(html_path))
+        # Check whether the Gemini file exists, to determine last_gemini_build
+        gemini_path = f"gemini-capsules/{args.blog}/{slug}.gmi"
+        last_gemini_build = None
+        rebuild_this_article_gemini = False
+        if os.path.exists(gemini_path):
+            last_gemini_build = time.ctime(os.path.getmtime(gemini_path))
+        # Check whether the article must be rebuilt by comparing modification times
+        if last_gemini_build:
+            file_modified_time = os.path.getmtime(file_path)
+            last_build_time = time.mktime(time.strptime(last_gemini_build))
+            rebuild_this_article_gemini = file_modified_time > last_build_time
+        else:
+            rebuild_this_article_gemini = True

         if run_pandoc:
+            # Check whether the article must be rebuilt by comparing modification times
+            rebuild_this_article_html = False
+            if last_html_build:
+                file_modified_time = os.path.getmtime(file_path)
+                last_build_time = time.mktime(time.strptime(last_html_build))
+                rebuild_this_article_html = file_modified_time > last_build_time
+            else:
+                rebuild_this_article_html = True
+
+        if run_pandoc and rebuild_this_article_html:
             # convert the org article content to html
             html_content = pypandoc.convert_text(content_without_h1, 'html', format='org')
+        else:
+            html_content = content_without_h1

-        if run_gemini:
+        if run_gemini and rebuild_this_article_gemini:
             os.makedirs(destination_gmi, exist_ok=True)
             # convert the org article content to gmi for the gemini capsule
             gemini_content = org_to_gmi(content_without_h1, slug)

         files_dict[f"{annee}/{slug}"] = {
             'path': file_path,
             'basename': basename,
@@ -148,6 +177,7 @@ if generate_linkings_json :
             'title': title,
             'next': None,
             'previous': None,
+            'last_html_build': last_html_build,
             'org_content': content,  # Original Org content
             'html_content_without_h1': re.sub(r'<h1>.*?</h1>', '', html_content),  # Converted HTML content without the level-1 title
             'html_content': html_content  # Converted HTML content
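The rebuild checks above round-trip each modification time through a ctime string (time.ctime() then time.strptime()), which depends on the locale's date format and drops sub-second precision. A sketch of an equivalent check on raw epoch timestamps; the function name and the paths are illustrative, not part of the commit:

    import os

    def needs_rebuild(source_path: str, output_path: str) -> bool:
        # Rebuild when the output is missing or older than its source
        if not os.path.exists(output_path):
            return True
        return os.path.getmtime(source_path) > os.path.getmtime(output_path)

    # e.g. needs_rebuild("sources/monblog/lang_fr/article.org",
    #                    "html-websites/monblog/2024/article/index.html")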
@@ -56,7 +56,6 @@ if not os.path.exists(blog_path):
     exit(1)

-uuid_value=''
 # Generate the org file name from the date and the slug
 now = datetime.now()
 # date_string = now.strftime("%Y-%m-%d")
@@ -80,24 +79,30 @@ import uuid

 def create_uuid_property():
     uuid_value = uuid.uuid4()
-    return f":PROPERTIES:\n:ID: {uuid_value}\n:END:\n"
+    return uuid_value

 # Write the org file
 with open(filename, "w") as f:
+    uuid = create_uuid_property()
     f.write(f"""
-#+title: {slug}
+:PROPERTIES:
+:ID: {uuid}
+:END:
+

+#+title: {args.title}
 #+post_ID:
 #+post_slug: organisation-de-taches-orgmode
 #+post_url: https://www.ciperbliss.com/{now.year}/{slug}
 #+post_title: {args.title}
 #+post_tags:
 #+post_series:
 #+post_series:
 #+post_type: post
 #+post_status: publish
 #+post_date_published: <{date_string_full}>
 #+post_date_modified: <{date_string_full}>
-#+post_index_page_roam_id: {uuid_value}
-#+BLOG: cipherbliss_blog {args.blog_dir}
+#+post_index_page_roam_id: {uuid}
+#+BLOG: {args.blog_dir}

 * {args.title}

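create_uuid_property() now returns the bare UUID, and the :PROPERTIES: drawer is written directly in the template. A self-contained sketch of the same pattern; note that binding the result to a name other than uuid avoids shadowing the uuid module, which the commit's own `uuid = create_uuid_property()` does for the rest of the script:

    import uuid

    def create_uuid_property():
        return uuid.uuid4()

    # roam_id is an illustrative name; the commit itself reuses the name uuid
    roam_id = create_uuid_property()
    print(f":PROPERTIES:\n:ID: {roam_id}\n:END:\n\n#+title: Mon article\n")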
stats.py (new file, 41 lines)
@@ -0,0 +1,41 @@
+#!/bin/python3
+# Generate statistics about all the websites
+
+from utils import get_stats_on_all_websites
+from website_config import configs_sites
+
+def main():
+    """
+    Main function: generate the statistics for every configured website
+    """
+    print("Génération des statistiques pour tous les sites web...")
+    lecture_mots_par_minute = 150
+    # Fetch the statistics for every site
+    stats = get_stats_on_all_websites()
+
+    # Display the statistics for each site
+    for site_name, site_stats in stats.items():
+        print(f"\n=== Statistiques pour {site_name} ===")
+        print(f"Nombre total d'articles: {site_stats['nb_articles']}")
+        # Format the word count with a thousands separator
+        mots_formatte = f"{site_stats['nb_mots']:,}".replace(',', ' ')
+        # Compute the reading time (150 words per minute)
+        temps_lecture = site_stats['nb_mots'] / lecture_mots_par_minute
+        heures = int(temps_lecture // 60)
+        minutes = int(temps_lecture % 60)
+
+        # If the reading time exceeds 48 h, convert it to days
+        if heures >= 48:
+            jours = heures // 24
+            heures = heures % 24
+            temps_lecture_str = f"{jours}j {heures}h {minutes}min"
+        else:
+            temps_lecture_str = f"{heures}h {minutes}min" if heures > 0 else f"{minutes}min"
+
+        print(f"Nombre total de mots: {mots_formatte}")
+        print(f"Temps de lecture estimé: {temps_lecture_str} (base: {lecture_mots_par_minute} mots/min)")
+        print(f"Dernier article publié: {site_stats['dernier_article']}")
+
+if __name__ == "__main__":
+    main()
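The reading-time arithmetic goes: nb_mots / 150 yields minutes, // 60 yields hours, and beyond 48 hours the hours roll over into days. For example 450 000 words gives 3 000 minutes, i.e. 50 h, displayed as "2j 2h 0min". A condensed sketch of the same computation (the function name is illustrative):

    def reading_time(nb_mots: int, mots_par_minute: int = 150) -> str:
        minutes_totales = nb_mots / mots_par_minute
        heures = int(minutes_totales // 60)
        minutes = int(minutes_totales % 60)
        if heures >= 48:
            return f"{heures // 24}j {heures % 24}h {minutes}min"
        return f"{heures}h {minutes}min" if heures > 0 else f"{minutes}min"

    assert reading_time(450_000) == "2j 2h 0min"
    assert reading_time(1_500) == "10min"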
utils.py (70 lines changed)
@@ -291,8 +291,6 @@ def find_slug_in_file_basename(file_basename) -> str:
     if len(splitted) > 1:
         slug = splitted[len(splitted)-1]

-        # final_slug=slug.replace("_cipherbliss_blog_","")
-        # final_slug=final_slug.replace("_blog_cil_gometz_","")
        slug = enlever_premier_tiret_ou_underscore(slug)

        slug = f"{year}/{slug}"
@@ -302,7 +300,73 @@
        return slug
    return None


+def get_stats_on_all_websites():
+    """
+    Return statistics about every website in the sources/ folder.
+    For each site, count the .org articles and find the most recent one.
+
+    :return: Dictionary with the stats per site
+    """
+    stats = {}
+    base_dir = "sources"
+
+    # Walk every site folder inside sources/
+    for site in os.listdir(base_dir):
+        site_path = os.path.join(base_dir, site)
+
+        if not os.path.isdir(site_path):
+            continue
+
+        # Initialise the stats for this site
+        stats[site] = {
+            'nb_articles': 0,
+            'nb_mots': 0,
+            'dernier_article': None,
+            'date_dernier_article': None
+        }
+
+        # Look for .org articles in lang_fr and lang_en
+        for lang in ['lang_fr', 'lang_en']:
+            lang_path = os.path.join(site_path, lang)
+
+            if not os.path.exists(lang_path):
+                continue
+
+            # List all the .org files
+            org_files = [f for f in os.listdir(lang_path) if f.endswith('.org')]
+            stats[site]['nb_articles'] += len(org_files)
+            # Compute the total word count for this language folder
+            total_mots = 0
+            for org_file in org_files:
+                file_path = os.path.join(lang_path, org_file)
+                try:
+                    with open(file_path, 'r', encoding='utf-8') as f:
+                        contenu = f.read()
+                        # Count the words by splitting on whitespace
+                        total_mots += len(contenu.split())
+                except Exception as e:
+                    print(f"Erreur lors de la lecture de {file_path}: {e}")
+
+            # Add this folder's words to the site counter
+            stats[site]['nb_mots'] += total_mots
+
+            # Find the most recent file
+            for org_file in org_files:
+                file_path = os.path.join(lang_path, org_file)
+                mod_time = os.path.getmtime(file_path)
+
+                if (stats[site]['date_dernier_article'] is None or
+                        mod_time > stats[site]['date_dernier_article']):
+                    stats[site]['date_dernier_article'] = mod_time
+                    stats[site]['dernier_article'] = file_path
+
+        # Convert the timestamp to a readable date
+        if stats[site]['date_dernier_article']:
+            stats[site]['date_dernier_article'] = datetime.fromtimestamp(
+                stats[site]['date_dernier_article']
+            ).strftime('%Y-%m-%d %H:%M:%S')
+
+    return stats
+

 def convert_org_to_html(org_file, output_html_file):
     """
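Counting words with len(contenu.split()) also counts org markup (#+ keywords, :PROPERTIES: drawers) as words, so the totals lean slightly high. A sketch of a stricter count that skips those lines; this is a heuristic alternative, not what the commit does:

    import re

    def count_words_org(contenu: str) -> int:
        total = 0
        for ligne in contenu.splitlines():
            stripped = ligne.strip()
            # skip org keyword lines (#+title: ...) and property drawers (:ID: ...)
            if stripped.startswith("#+") or re.fullmatch(r":[A-Za-z_]+:.*", stripped):
                continue
            total += len(stripped.split())
        return total

    assert count_words_org("#+title: demo\n:ID: abc\nun deux trois") == 3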
@@ -229,19 +229,20 @@ default_config = {
     "NAVIGATION": """
     <nav>
         <a href="/">Accueil</a>
         <a href="/feed">Flux RSS</a>
         <a href="/tags">Tags</a>
         <a href="/contact">Contact</a>
     </nav>
     """,
     "BANNIERE_ENTETE": "https://www.cipherbliss.com/banner.jpg",
     "BANNIERE_ENTETE_ALT": "Bannière par défaut",
-    "SERIES" : {
-        "SERIE_1" : {
-            "TITLE" : "Série 1",
-            "ARTICLES" : [
-                "2024/article-1" : {
-                    "TITLE" : "Article 1",
-                    "slug" : "2024/article-1",
+    "SERIES": {
+        "SERIE_1": {
+            "TITLE": "Série 1",
+            "ARTICLES": [
+                {
+                    "TITLE": "Article 1",
+                    "slug": "2024/article-1"
+                }
+            ]
+        }
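The old ARTICLES value placed dict-key syntax directly inside a list literal, which is not valid Python; the corrected shape is a list of dicts. A sketch of iterating a series under that corrected shape, using the excerpt above as the assumed structure:

    default_config = {
        "SERIES": {
            "SERIE_1": {
                "TITLE": "Série 1",
                "ARTICLES": [
                    {"TITLE": "Article 1", "slug": "2024/article-1"},
                ],
            },
        },
    }

    for serie in default_config["SERIES"].values():
        for article in serie["ARTICLES"]:
            print(f'{serie["TITLE"]}: {article["TITLE"]} -> /{article["slug"]}')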