From 791d93ecdeb05321b0f8bdd0cd2b3d0de3fc0250 Mon Sep 17 00:00:00 2001
From: Tykayn
Date: Mon, 18 Nov 2024 11:18:50 +0100
Subject: [PATCH] up atom feed generate

---
 .gitignore             |  1 +
 atom_generate.py       | 49 ++++++++++++++++++++++++++++++------------
 build_indexes.py       |  2 +-
 converters.sh          |  4 ++++
 enrich_html.py         |  5 +++--
 gather_tags_in_json.py |  2 +-
 testing.py             |  8 +++----
 utils.py               | 23 +++++++++++++++++---
 8 files changed, 69 insertions(+), 25 deletions(-)

diff --git a/.gitignore b/.gitignore
index 9713f3bd..4dd8f2b2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,4 @@ output/*.org
 html-websites/*
 pictures_done
 pictures_inbox/*
+/index*.xml
\ No newline at end of file
diff --git a/atom_generate.py b/atom_generate.py
index 228e9763..7194c69d 100755
--- a/atom_generate.py
+++ b/atom_generate.py
@@ -5,57 +5,78 @@ from datetime import datetime
 # Path of the source folder
 import argparse
 
+from utils import find_first_level1_title, find_year_and_slug, find_extract_in_content_org
+from website_config import configs_sites
+
 # Command-line argument configuration
 parser = argparse.ArgumentParser(description="Générer un nouvel article en mode orgmode.")
 parser.add_argument("blog_dir", help="Le nom du dossier de blog.")
 args = parser.parse_args()
 
-blog_dir = 'sources/'+args.blog_dir
+website_ndd = configs_sites[args.blog_dir]['NDD']
+blog_dir = 'sources/'+args.blog_dir+'/lang_fr/'
 
 # Regular expression to extract the date from the article content
 date_regex = re.compile(r"\b(\d{14})\b")
+date_regex_org = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")
 
 # List of org-mode files found
 org_files = []
 
+limit_articles_feed=1000
+count_articles=0
+print('atom generate: fichiers dans le dossier: ',len((blog_dir)))
 # Walk the source folder looking for org-mode files
 for root, dirs, files in os.walk(blog_dir):
     for file in files:
         if file.endswith(".org"):
+            print(os.path.join(root, file))
+            date_str, annee, slug = find_year_and_slug(file)
             # Open the file and look for the first date in the article content
             with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                 content = f.read()
-                match = date_regex.search(content)
+                extract = find_extract_in_content_org(content)
+                count_articles+=1
+                match = date_regex_org.search(content)
                 if match:
                     date = datetime.strptime(match.group(1), "%Y-%m-%d")
                     # Add the file to the list with its matching date
-                    org_files.append((date, os.path.join(root, file)))
+                    org_files.append((date, os.path.join(root, file), annee, slug,extract))
+            if count_articles > limit_articles_feed:
+                break
+    if count_articles > limit_articles_feed:
+        break
 
 # Sort files by descending date
 org_files.sort(reverse=True)
 
 # Generate the Atom feed
-atom_feed = {"title": "Flux Atom des articles GMI",
-             "link": "http://www.example.com/atom",
-             "updated": org_files[0][0].strftime("%Y-%m-%dT%H:%M:%SZ"),
+atom_feed = {"title": "Flux Atom des articles de "+args.blog_dir,
+             "link": f"{website_ndd}/feed",
+             # "updated": org_files[0][0].strftime("%Y-%m-%dT%H:%M:%SZ"),
+             "updated": org_files[0][0],
              "entries": []}
 
-for date, file in org_files:
+for date, file, annee, slug, extract in org_files:
     # Parse the org-mode file to extract the title, description and publication date
     with open(file, "r", encoding="utf-8") as f:
         content = f.read()
-        title = re.search(r"\*+ (.+)\n", content).group(1)
-        description = re.search(r"\n+ (.+)\n", content, re.DOTALL).group(1)
-        published = date.strftime("%Y-%m-%dT%H:%M:%SZ")
+        title = find_first_level1_title(content)
+        description = title
+        # published = date_str
 
     # Add the article to the Atom feed
-    atom_entry = {"title": title, "link": file, "summary": description, "published": published}
+    atom_entry = {"title": title,
+                  "summary": extract,
+                  "link": f"{website_ndd}/{annee}/{slug}",
+                  "published": date
+                  }
     atom_feed["entries"].append(atom_entry)
-    if published > atom_feed["updated"]:
-        atom_feed["updated"] = published
+    # if published > atom_feed["updated"]:
+    #     atom_feed["updated"] = published
 
 # Write the Atom feed to a file
-with open("atom.xml", "w", encoding="utf-8") as f:
+with open(f"index_{args.blog_dir}.xml", "w", encoding="utf-8") as f:
     f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
     f.write('<feed xmlns="http://www.w3.org/2005/Atom">\n')
     f.write(f' <title>{atom_feed["title"]}</title>\n')
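The hunk above stops at the feed's <title> element; the serialization of the collected entries continues past the diff context. For orientation, here is a minimal sketch of how the atom_feed["entries"] built above could be written out, assuming the tuple layout from the loop; write_entries and the XML escaping are illustrative additions, not code from this commit:

from xml.sax.saxutils import escape

def write_entries(f, atom_feed):
    # Hypothetical helper, not part of the patch: serialize each entry
    # dict into an Atom <entry> element, escaping text content.
    for entry in atom_feed["entries"]:
        f.write('  <entry>\n')
        f.write(f'    <title>{escape(entry["title"] or "")}</title>\n')
        f.write(f'    <link href="{entry["link"]}"/>\n')
        # entry["published"] holds the datetime parsed from the org content
        f.write(f'    <published>{entry["published"].strftime("%Y-%m-%dT%H:%M:%SZ")}</published>\n')
        f.write(f'    <summary>{escape(entry["summary"] or "")}</summary>\n')
        f.write('  </entry>\n')
    f.write('</feed>\n')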
diff --git a/build_indexes.py b/build_indexes.py
index 9487454f..d556e1e8 100755
--- a/build_indexes.py
+++ b/build_indexes.py
@@ -66,7 +66,7 @@ def generer_index(dossier_source, fichier_index):
 
             article_name = trouver_nom_article(file_path_org, args.source, 'org')
             basename_file = os.path.basename(file_path_org)
-            article_relative_url = detect_slug_in_file_basename(basename_file)
+            article_relative_url = find_slug_in_file_basename(basename_file)
 
             if not article_name:
                 article_name = article_relative_url.replace('-', ' ')
diff --git a/converters.sh b/converters.sh
index a56ad4c9..7a354c01 100755
--- a/converters.sh
+++ b/converters.sh
@@ -77,6 +77,8 @@ generate_website() {
 
   mkdir -p html-websites/$website_name
 
+  mkdir -p html-websites/$website_name/feed
+
   rm -rf html-websites/$website_name/*
   rm -rf sources/$website_name/converted/*
   rm -rf sources/$website_name/lang_fr/converted/*
@@ -212,5 +214,7 @@ for website_name in "${blogs_folders[@]}"; do
   # process image resizing in the inbox
   python3 pictures_resize.py
 
+  python3 atom_generate.py $website_name
+  mv "index_$website_name.xml" "html-websites/$website_name/feed/index.xml"
 
 done
diff --git a/enrich_html.py b/enrich_html.py
index 949fb3cc..248a1c13 100755
--- a/enrich_html.py
+++ b/enrich_html.py
@@ -56,7 +56,7 @@ def enrich_one_file(html_content: str, partials: dict = {"header_page": "", "foo
             css_content = f.read()
     css_content = ""
     template_content["CSS_INLINE_CONTENT"] = css_content
-    template_content["PAGE_SLUG"] = detect_slug_in_file_basename(file)
+    template_content["PAGE_SLUG"] = find_slug_in_file_basename(file)
     # fill in the template
     html_content = f"""
@@ -70,7 +70,7 @@ def enrich_one_file(html_content: str, partials: dict = {"header_page": "", "foo
-
+
@@ -145,6 +145,7 @@ def enrich_one_file(html_content: str, partials: dict = {"header_page": "", "foo
diff --git a/gather_tags_in_json.py b/gather_tags_in_json.py
index 11b716b0..837819de 100644
--- a/gather_tags_in_json.py
+++ b/gather_tags_in_json.py
@@ -128,7 +128,7 @@ def generate_html_pages_for_all_tags(tag_to_files, html_output_folder):
     """
     for file_path_org in files:
         basename_file = os.path.basename(file_path_org)
-        slug = detect_slug_in_file_basename(basename_file)
+        slug = find_slug_in_file_basename(basename_file)
         if not slug:
             slug = trouver_slug_in_article(file_path_org)
 
diff --git a/testing.py b/testing.py
index ca82653d..57f85830 100644
--- a/testing.py
+++ b/testing.py
@@ -14,17 +14,17 @@ from website_config import configs_sites
 
 # test slug generation on a given org file
 basename_file = '20111126170159_cipherbliss_blog_120_bienvenue-sur-informageek.org'
-found_slug = detect_slug_in_file_basename(basename_file)
+found_slug = find_slug_in_file_basename(basename_file)
 print('slug found:',found_slug)
 
 basename_file = '20200803124344_blog_cil_gometz_11_ecrire-une-comptine-en-python-bonjour-le-jour.org'
-found_slug = detect_slug_in_file_basename(basename_file)
+found_slug = find_slug_in_file_basename(basename_file)
 print('slug found:',found_slug)
 
 basename_file = '20241115010205_cipherbliss_blog_suivi-de-rédaction-de-livre-orgmode.org'
-found_slug = detect_slug_in_file_basename(basename_file)
+found_slug = find_slug_in_file_basename(basename_file)
 print('slug found:',found_slug)
 
 basename_file = '20061125015032_tkblog_864_pourquoi-mee2-est-il-une-flamme.org'
-found_slug = detect_slug_in_file_basename(basename_file)
+found_slug = find_slug_in_file_basename(basename_file)
 print('slug found:',found_slug)
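With converters.sh now publishing each feed at html-websites/$website_name/feed/index.xml, the generated output can be sanity-checked with the standard library alone. A minimal sketch, assuming the generated file is well-formed Atom XML; the cipherbliss_blog folder name is only an example:

import xml.etree.ElementTree as ET

ATOM_NS = {"atom": "http://www.w3.org/2005/Atom"}

# Parse a feed produced by atom_generate.py and list its entry titles.
tree = ET.parse("index_cipherbliss_blog.xml")
for entry in tree.getroot().findall("atom:entry", ATOM_NS):
    title = entry.find("atom:title", ATOM_NS)
    print(title.text if title is not None else "(untitled)")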
diff --git a/utils.py b/utils.py
index 37e1166a..fbdad6b3 100644
--- a/utils.py
+++ b/utils.py
@@ -66,7 +66,7 @@ def trouver_nom_article(fichier_org, blog_name, format="html"):
 def find_year_and_slug(fichier):
     fichier = fichier.replace('..', '.')
     mylog(f" ------------ build_indexes: find in {fichier} -------------")
-    slug = fichier.replace('.gmi', '')
+    slug = fichier.replace('.gmi', '').replace('.org', '')
     annee = '2024'
     date_str = '2024-00-00'
     date = '2024-00-00'
@@ -86,7 +86,6 @@ def find_year_and_slug(fichier):
     else:
         date = datetime.strptime(date_str, "%Y%m%d%H%M%S")
         date_string_replaced = str(date).replace(' 00:00:00', '')
-        slug = fichier.replace('.gmi', '')
         slug = slug.replace(date_string_replaced, '')
         slug = enlever_premier_tiret_ou_underscore(slug)
 
@@ -155,6 +154,24 @@ def find_first_level1_title(content):
         return match.group(1)
     return None
 
+def find_extract_in_content_org(org_content):
+    # Remove lines starting with #+
+    org_content = re.sub(r'^\s*#\+.*\n', '', org_content, flags=re.MULTILINE)
+
+    # Remove logbook sections
+    org_content = re.sub(r'^\*\* Logbook\n.*?(?=\*\* |\Z)', '', org_content, flags=re.DOTALL | re.MULTILINE)
+
+    # Remove property drawers
+    org_content = re.sub(r'^:PROPERTIES:\n.*?:END:\n', '', org_content, flags=re.DOTALL | re.MULTILINE)
+
+    # Remove extra blank lines
+    org_content = re.sub(r'\n\s*\n+', '\n', org_content)
+
+    # Strip leading and trailing whitespace on each line
+    org_content = '\n'.join(line.strip() for line in org_content.splitlines())
+
+    # Strip leading and trailing whitespace from the final content
+    return org_content.strip()
 def extract_body_content(html_content):
     pattern = r'<body>(.*?)</body>'
@@ -199,7 +216,7 @@ def slugify_title(title_text):
     title_text = title_text.strip('-')
     return title_text
 
-def detect_slug_in_file_basename(file_basename) -> str:
+def find_slug_in_file_basename(file_basename) -> str:
     """
     Extract the year and the slug from the file name according to the specified format.
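To make the behavior of the new find_extract_in_content_org helper concrete, a small usage sketch follows; the sample org document is invented for illustration:

from utils import find_extract_in_content_org

# Invented sample: #+ metadata lines, a properties drawer, then body text.
sample = """#+title: Exemple
#+date: 2024-11-18
:PROPERTIES:
:ID: 1234
:END:
* Un titre

Le premier paragraphe de l'article.
"""

print(find_extract_in_content_org(sample))
# Prints:
# * Un titre
# Le premier paragraphe de l'article.
# The #+ lines and the :PROPERTIES: drawer are removed, blank lines collapsed.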