up atom feed generate

This commit is contained in:
Tykayn 2024-11-18 11:18:50 +01:00 committed by tykayn
parent bff1ec1392
commit 791d93ecde
8 changed files with 69 additions and 25 deletions

1
.gitignore vendored
View File

@ -12,3 +12,4 @@ output/*.org
html-websites/* html-websites/*
pictures_done pictures_done
pictures_inbox/* pictures_inbox/*
/index*.xml

View File

@ -5,57 +5,78 @@ from datetime import datetime
# Chemin du dossier source # Chemin du dossier source
import argparse import argparse
from utils import find_first_level1_title, find_year_and_slug, find_extract_in_content_org
from website_config import configs_sites
# Configuration des arguments de la ligne de commande # Configuration des arguments de la ligne de commande
parser = argparse.ArgumentParser(description="Générer un nouvel article en mode orgmode.") parser = argparse.ArgumentParser(description="Générer un nouvel article en mode orgmode.")
parser.add_argument("blog_dir", help="Le nom du dossier de blog.") parser.add_argument("blog_dir", help="Le nom du dossier de blog.")
args = parser.parse_args() args = parser.parse_args()
blog_dir = 'sources/'+args.blog_dir website_ndd = configs_sites[args.blog_dir]['NDD']
blog_dir = 'sources/'+args.blog_dir+'/lang_fr/'
# Expression régulière pour extraire la date du contenu de l'article # Expression régulière pour extraire la date du contenu de l'article
date_regex = re.compile(r"\b(\d{14})\b") date_regex = re.compile(r"\b(\d{14})\b")
date_regex_org = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")
# Liste des fichiers org-mode trouvés # Liste des fichiers org-mode trouvés
org_files = [] org_files = []
limit_articles_feed=1000
count_articles=0
print('atom generate: fichiers dans le dossier: ',len((blog_dir)))
# Parcourt le dossier source à la recherche de fichiers org-mode # Parcourt le dossier source à la recherche de fichiers org-mode
for root, dirs, files in os.walk(blog_dir): for root, dirs, files in os.walk(blog_dir):
for file in files: for file in files:
if file.endswith(".org"): if file.endswith(".org"):
print(os.path.join(root, file))
date_str, annee, slug = find_year_and_slug(file)
# Ouvre le fichier et recherche la première date dans le contenu de l'article # Ouvre le fichier et recherche la première date dans le contenu de l'article
with open(os.path.join(root, file), "r", encoding="utf-8") as f: with open(os.path.join(root, file), "r", encoding="utf-8") as f:
content = f.read() content = f.read()
match = date_regex.search(content) extract = find_extract_in_content_org(content)
count_articles+=1
match = date_regex_org.search(content)
if match: if match:
date = datetime.strptime(match.group(1), "%Y-%m-%d") date = datetime.strptime(match.group(1), "%Y-%m-%d")
# Ajoute le fichier à la liste avec sa date correspondante # Ajoute le fichier à la liste avec sa date correspondante
org_files.append((date, os.path.join(root, file))) org_files.append((date, os.path.join(root, file), annee, slug,extract))
if count_articles > limit_articles_feed:
break
if count_articles > limit_articles_feed:
break
# Tri des fichiers par ordre décroissant de date # Tri des fichiers par ordre décroissant de date
org_files.sort(reverse=True) org_files.sort(reverse=True)
# Génération du flux Atom # Génération du flux Atom
atom_feed = {"title": "Flux Atom des articles GMI", atom_feed = {"title": "Flux Atom des articles de "+args.blog_dir,
"link": "http://www.example.com/atom", "link": f"{website_ndd}/feed",
"updated": org_files[0][0].strftime("%Y-%m-%dT%H:%M:%SZ"), # "updated": org_files[0][0].strftime("%Y-%m-%dT%H:%M:%SZ"),
"updated": org_files[0][0],
"entries": []} "entries": []}
for date, file in org_files: for date, file, annee, slug, extract in org_files:
# Parse le fichier org-mode pour extraire le titre, la description et la date de publication # Parse le fichier org-mode pour extraire le titre, la description et la date de publication
with open(file, "r", encoding="utf-8") as f: with open(file, "r", encoding="utf-8") as f:
content = f.read() content = f.read()
title = re.search(r"\*+ (.+)\n", content).group(1) title = find_first_level1_title(content)
description = re.search(r"\n+ (.+)\n", content, re.DOTALL).group(1) description = title
published = date.strftime("%Y-%m-%dT%H:%M:%SZ") # published = date_str
# Ajoute l'article au flux Atom # Ajoute l'article au flux Atom
atom_entry = {"title": title, "link": file, "summary": description, "published": published} atom_entry = {"title": title,
"summary": extract,
"link": f"{website_ndd}/{annee}/{slug}",
"published": date
}
atom_feed["entries"].append(atom_entry) atom_feed["entries"].append(atom_entry)
if published > atom_feed["updated"]: # if published > atom_feed["updated"]:
atom_feed["updated"] = published # atom_feed["updated"] = published
# Enregistrement du flux Atom dans un fichier # Enregistrement du flux Atom dans un fichier
with open("atom.xml", "w", encoding="utf-8") as f: with open(f"index_{args.blog_dir}.xml", "w", encoding="utf-8") as f:
f.write('<?xml version="1.0" encoding="UTF-8"?>\n') f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
f.write('<feed xmlns="http://www.w3.org/2005/Atom">\n') f.write('<feed xmlns="http://www.w3.org/2005/Atom">\n')
f.write(f' <title>{atom_feed["title"]}</title>\n') f.write(f' <title>{atom_feed["title"]}</title>\n')

View File

@ -66,7 +66,7 @@ def generer_index(dossier_source, fichier_index):
article_name = trouver_nom_article(file_path_org, args.source, 'org') article_name = trouver_nom_article(file_path_org, args.source, 'org')
basename_file = os.path.basename(file_path_org) basename_file = os.path.basename(file_path_org)
article_relative_url = detect_slug_in_file_basename(basename_file) article_relative_url = find_slug_in_file_basename(basename_file)
if not article_name: if not article_name:
article_name = article_relative_url.replace('-', ' ') article_name = article_relative_url.replace('-', ' ')

View File

@ -77,6 +77,8 @@ generate_website() {
mkdir -p html-websites/$website_name mkdir -p html-websites/$website_name
mkdir -p html-websites/$website_name/feed
rm -rf html-websites/$website_name/* rm -rf html-websites/$website_name/*
rm -rf sources/$website_name/converted/* rm -rf sources/$website_name/converted/*
rm -rf sources/$website_name/lang_fr/converted/* rm -rf sources/$website_name/lang_fr/converted/*
@ -212,5 +214,7 @@ for website_name in "${blogs_folders[@]}"; do
# traiter les réductions d'images dans l'inbox # traiter les réductions d'images dans l'inbox
python3 pictures_resize.py python3 pictures_resize.py
python3 atom_generate.py $website_name
mv "index_$website_name.xml" "html-websites/$website_name/feed/index.xml"
done done

View File

@ -56,7 +56,7 @@ def enrich_one_file(html_content: str, partials: dict = {"header_page": "", "foo
css_content = f.read() css_content = f.read()
css_content = "<style type='text/css'>{css_content}</style>" css_content = "<style type='text/css'>{css_content}</style>"
template_content["CSS_INLINE_CONTENT"] = css_content template_content["CSS_INLINE_CONTENT"] = css_content
template_content["PAGE_SLUG"] = detect_slug_in_file_basename(file) template_content["PAGE_SLUG"] = find_slug_in_file_basename(file)
# remplir le template # remplir le template
html_content = f""" html_content = f"""
@ -70,7 +70,7 @@ def enrich_one_file(html_content: str, partials: dict = {"header_page": "", "foo
<meta property="og:description" content="{template_content['BLOG_SUBTITLE']}"> <meta property="og:description" content="{template_content['BLOG_SUBTITLE']}">
<meta property="og:url" content="{template_content['NDD']}"> <meta property="og:url" content="{template_content['NDD']}">
<meta property="og:site_name" content="{template_content['TITLE']}"> <meta property="og:site_name" content="{template_content['TITLE']}">
<link rel="alternate" type="application/rss+xml" title="Cipher Bliss » Flux" href="{template_content['NDD']}/feed/"> <link rel="alternate" type="application/atom+xml" title="Cipher Bliss » Flux" href="{template_content['NDD']}/feed/">
<link href="/style.css" rel="stylesheet"> <link href="/style.css" rel="stylesheet">
<script src="main_script.js"></script> <script src="main_script.js"></script>
<meta charset="utf-8"> <meta charset="utf-8">
@ -145,6 +145,7 @@ def enrich_one_file(html_content: str, partials: dict = {"header_page": "", "foo
<nav class="footer-nav"> <nav class="footer-nav">
{template_content['NAVIGATION']} {template_content['NAVIGATION']}
<a href="/tags/">Tags</a> <a href="/tags/">Tags</a>
<a href="{template_content['NDD']}/feed/">Flux Atom</a>
</nav> </nav>
</div> </div>
</div> </div>

View File

@ -128,7 +128,7 @@ def generate_html_pages_for_all_tags(tag_to_files, html_output_folder):
""" """
for file_path_org in files: for file_path_org in files:
basename_file = os.path.basename(file_path_org) basename_file = os.path.basename(file_path_org)
slug = detect_slug_in_file_basename(basename_file) slug = find_slug_in_file_basename(basename_file)
if not slug: if not slug:
slug = trouver_slug_in_article(file_path_org) slug = trouver_slug_in_article(file_path_org)

View File

@ -14,17 +14,17 @@ from website_config import configs_sites
# tester la génération de slug sur un fichier org donné # tester la génération de slug sur un fichier org donné
basename_file = '20111126170159_cipherbliss_blog_120_bienvenue-sur-informageek.org' basename_file = '20111126170159_cipherbliss_blog_120_bienvenue-sur-informageek.org'
found_slug = detect_slug_in_file_basename(basename_file) found_slug = find_slug_in_file_basename(basename_file)
print('slug found:',found_slug) print('slug found:',found_slug)
basename_file = '20200803124344_blog_cil_gometz_11_ecrire-une-comptine-en-python-bonjour-le-jour.org' basename_file = '20200803124344_blog_cil_gometz_11_ecrire-une-comptine-en-python-bonjour-le-jour.org'
found_slug = detect_slug_in_file_basename(basename_file) found_slug = find_slug_in_file_basename(basename_file)
print('slug found:',found_slug) print('slug found:',found_slug)
basename_file = '20241115010205_cipherbliss_blog_suivi-de-rédaction-de-livre-orgmode.org' basename_file = '20241115010205_cipherbliss_blog_suivi-de-rédaction-de-livre-orgmode.org'
found_slug = detect_slug_in_file_basename(basename_file) found_slug = find_slug_in_file_basename(basename_file)
print('slug found:',found_slug) print('slug found:',found_slug)
basename_file = '20061125015032_tkblog_864_pourquoi-mee2-est-il-une-flamme.org' basename_file = '20061125015032_tkblog_864_pourquoi-mee2-est-il-une-flamme.org'
found_slug = detect_slug_in_file_basename(basename_file) found_slug = find_slug_in_file_basename(basename_file)
print('slug found:',found_slug) print('slug found:',found_slug)

View File

@ -66,7 +66,7 @@ def trouver_nom_article(fichier_org, blog_name, format="html"):
def find_year_and_slug(fichier): def find_year_and_slug(fichier):
fichier = fichier.replace('..', '.') fichier = fichier.replace('..', '.')
mylog(f" ------------ build_indexes: find in {fichier} -------------") mylog(f" ------------ build_indexes: find in {fichier} -------------")
slug = fichier.replace('.gmi', '') slug = fichier.replace('.gmi', '').replace('.org', '')
annee = '2024' annee = '2024'
date_str = '2024-00-00' date_str = '2024-00-00'
date = '2024-00-00' date = '2024-00-00'
@ -86,7 +86,6 @@ def find_year_and_slug(fichier):
else: else:
date = datetime.strptime(date_str, "%Y%m%d%H%M%S") date = datetime.strptime(date_str, "%Y%m%d%H%M%S")
date_string_replaced = str(date).replace(' 00:00:00', '') date_string_replaced = str(date).replace(' 00:00:00', '')
slug = fichier.replace('.gmi', '')
slug = slug.replace(date_string_replaced, '') slug = slug.replace(date_string_replaced, '')
slug = enlever_premier_tiret_ou_underscore(slug) slug = enlever_premier_tiret_ou_underscore(slug)
@ -155,6 +154,24 @@ def find_first_level1_title(content):
return match.group(1) return match.group(1)
return None return None
def find_extract_in_content_org(org_content):
    """Return a cleaned plain-text extract of an org-mode document.

    Removes ``#+`` directive lines, ``** Logbook`` sections and
    ``:PROPERTIES:`` drawers, collapses runs of blank lines into a single
    newline, and strips leading/trailing whitespace from every line and
    from the final result.

    :param org_content: raw org-mode file content as a single string
    :return: the cleaned content (note: the whole cleaned body, not a
        truncated excerpt)
    """
    # (pattern, replacement, flags) triples, applied in order.
    cleanup_rules = (
        # Drop lines that start with #+ (org-mode meta/export directives).
        (r'^\s*#\+.*\n', '', re.MULTILINE),
        # Drop '** Logbook' sections up to the next level-2 heading or EOF.
        # NOTE(review): matches the literal heading '** Logbook' only, not
        # :LOGBOOK: drawers — confirm this matches the project's org format.
        (r'^\*\* Logbook\n.*?(?=\*\* |\Z)', '', re.DOTALL | re.MULTILINE),
        # Drop :PROPERTIES: ... :END: drawers.
        (r'^:PROPERTIES:\n.*?:END:\n', '', re.DOTALL | re.MULTILINE),
        # Collapse consecutive blank lines into a single newline.
        (r'\n\s*\n+', '\n', 0),
    )
    text = org_content
    for pattern, replacement, flags in cleanup_rules:
        text = re.sub(pattern, replacement, text, flags=flags)
    # Strip whitespace around each remaining line, then around the whole text.
    text = '\n'.join(line.strip() for line in text.splitlines())
    return text.strip()
def extract_body_content(html_content): def extract_body_content(html_content):
pattern = r'<body.*?>(.*?)</body>' pattern = r'<body.*?>(.*?)</body>'
@ -199,7 +216,7 @@ def slugify_title(title_text):
title_text = title_text.strip('-') title_text = title_text.strip('-')
return title_text return title_text
def detect_slug_in_file_basename(file_basename) -> str: def find_slug_in_file_basename(file_basename) -> str:
""" """
Extrait l'année et le slug du nom de fichier selon le format spécifié. Extrait l'année et le slug du nom de fichier selon le format spécifié.