auto detect tags and move pictures after resizing

2024-11-15 15:58:19 +01:00 · 2024-11-15 15:58:19 +01:00 · 618c029c62
commit 618c029c62
parent 9c72473913
8 changed files with 165 additions and 154 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,6 @@
 **/converted/
 sources/*/converted
 sources/**/converted
 sources/*/lang_*/converted
 sources/**/*.html
 index_*.html
@ -8,3 +9,5 @@ __pycache__
 output/pictures
 output/*.org
 html-websites/*
 pictures_done
 pictures_inbox/*
--- a/README.md
+++ b/README.md
@ -5,12 +5,16 @@ Génération de capsules gemini et blog html à partir d'articles en fichiers Or
 exemple avec le blog de DragonFeu.
 `sources/dragonfeu_blog`
-Les articles sont écrits dans le dossier source/, leur nom permet de déduire l'ordre de publication grâce à une date YYY-MM-DD en début de nom de fichier
+Les articles sont écrits dans le dossier source/, leur nom permet de déduire l'ordre de publication grâce à une date YYYY-MM-DD en début de nom de fichier.
 ## création d'un article 
-Pour un article en langue En, sur le blog cipherbliss_blog, donner le titre entre guillemets ainsi: 
+Pour un article en langue En (english), sur le blog cipherbliss_blog, donner le titre entre guillemets ainsi: 
 ```shell
 python3 new_article.py cipherbliss_blog en "Creation of a gemini blog"
 ```
 On peut se faire un alias de commande en bash pour n'avoir à remplir que le titre pour un blog donné.
 ## prérequis pour installation
 Avoir des articles au format org, les mettre dans un dossier source/nom_du_blog, et avoir quelques bibliothèques dispo:
--- a/build_indexes.py
+++ b/build_indexes.py
@ -1,9 +1,9 @@
 import argparse
 import datetime
 import os
 import re
 import shutil
 from utils import *
 from enrich_html import enrich_one_file
 from website_config import configs_sites, global_config
@ -22,120 +22,11 @@ source_files_extension = "org"
 config_title = configs_sites[args.source]['BLOG_TITLE']
 # Expression régulière pour extraire la date et le slug du nom de fichier org
 regex = r"^(\d{14})(-[a-zA-Z0-9_-]+)\.gmi$"
 regex_orgroam = r"^(\d{14})_([a-zA-Z0-9_-]+)\.gmi$"
 use_article_file_for_name = (not global_config["slug_with_year"])
 website_name = args.source
 def trouver_nom_article(fichier_org, format="html"):
     """Return the display name of an article, read from its first usable heading.

     The file is scanned line by line. The first heading whose text is not
     "article" or "liens" (case-insensitive) becomes the article name.

     :param fichier_org: path of the file to scan.
     :param format: 'html' to look for <h1>/<h2> tags; any other value uses
         org-mode star headings.
     :return: the heading text with the ``args.source + '_'`` prefix removed
         and underscores replaced by spaces, or '' when no heading qualifies.
     """
     if format == 'html':
         titre_niveau_1 = r'<h1\s+(?:id|data-created)="[^"]*">(.*?)</h1>'
         titre_niveau_2 = r'^\<h2.*?\>(.+)\<\/h2\>$'
     else:
         # NOTE: '\*+' matches one or more stars, so this pattern also matches
         # deeper org headings, not only level-1 ones.
         titre_niveau_1 = r'^\*+ (.+)$'
         titre_niveau_2 = r'^\*\*+ (.+)$'

     nom_article = ''
     # Articles contain accented text: decode explicitly as UTF-8 instead of
     # relying on the platform's default locale encoding.
     with open(fichier_org, 'r', encoding='utf-8') as file:
         for ligne in file:
             titre_niveau_1_match = re.match(titre_niveau_1, ligne)
             if not titre_niveau_1_match:
                 continue
             titre_niveau_1_texte = titre_niveau_1_match.group(1)
             if titre_niveau_1_texte.lower() not in ("article", "liens"):
                 nom_article = titre_niveau_1_texte
                 break
             # The heading is a generic "Article"/"Liens" one: accept it only
             # if this same line also matches the level-2 pattern.
             titre_niveau_2_match = re.match(titre_niveau_2, ligne)
             if titre_niveau_2_match:
                 nom_article = titre_niveau_2_match.group(1)
                 break
     return nom_article.replace(args.source + '_', '').replace('_', ' ')
 def find_year_and_slug(fichier):
    """Derive the date string, the year and the slug of an article from its file name.

    The name is tested against the module-level patterns ``regex_orgroam``
    and then ``regex``; when both match, the values computed for ``regex``
    overwrite the earlier ones.

    :param fichier: file name ending in '.gmi'.
    :return: list ``[date_str, annee, slug]``. When no pattern matches, the
        defaults are '2024-00-00', '2024' and the file name without '.gmi'.
    """
    # Collapse a doubled dot (e.g. "name..gmi") into a single one.
    fichier = fichier.replace('..', '.')
    # Fallback values used when the file name matches neither pattern.
    slug = fichier.replace('.gmi', '')
    annee = '2024'
    date_str = '2024-00-00'
    date = '2024-00-00'
    # First pattern: the date is capture group 1 and the slug is group 2.
    match = re.match(regex_orgroam, fichier)
    if match:
        date_str = match.group(1)
        annee = date_str[:4]
        slug = match.group(2)
    # Second pattern: the date is capture group 1; the slug is rebuilt below.
    match = re.match(regex, fichier)
    if match:
        date_str = match.group(1)
        # Convert the date string into a datetime object.
        # NOTE(review): if `regex` only captures 14 digits, the "-" branch
        # looks unreachable; confirm against the pattern definition.
        if "-" in date_str:
            date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
        else:
            date = datetime.datetime.strptime(date_str, "%Y%m%d%H%M%S")
        date_string_replaced = str(date).replace(' 00:00:00', '')
        slug = fichier.replace('.gmi', '')
        # NOTE(review): str(date) is dash/colon formatted, so for a digits-only
        # timestamp in the file name this replace is presumably a no-op and
        # the slug keeps the date prefix; verify this is intended.
        slug = slug.replace(date_string_replaced, '')
        slug = enlever_premier_tiret_ou_underscore(slug)
        annee = str(date.year).replace(' 00:00:00', '')
    return [date_str, annee, slug]
 def enlever_premier_tiret_ou_underscore(chaîne):
     """Return the string without its first character when that character is '-' or '_'.

     Only one leading character is removed; any other string is returned unchanged.
     """
     return chaîne[1:] if chaîne.startswith(('-', '_')) else chaîne
 def create_path_folders_and_move_file(path, file):
     """Move `file` to `path`, creating any missing intermediate folders first.

     :param path: destination path of the file, including its file name.
     :param file: path of the existing file to move.
     """
     parent_dir = os.path.dirname(path)
     # os.makedirs('') raises FileNotFoundError, so skip folder creation when
     # the destination is a bare file name in the current directory.
     if parent_dir:
         os.makedirs(parent_dir, exist_ok=True)
     shutil.move(file, path)
 def get_files_list_of_folder(folder_path):
     """List the article files of a folder as '.gmi' names, sorted in descending order.

     Only files whose name ends with '.' + `source_files_extension` are kept;
     that suffix, and only the suffix, is swapped for '.gmi'.

     :param folder_path: folder to list.
     :return: list of file names, or None when the folder does not exist or
         cannot be read (an error message is printed in both cases).
     """
     # Check that the folder exists.
     if not os.path.exists(folder_path):
         print(f" ------------ build_indexes: Erreur : Le dossier '{folder_path}' n'existe pas.")
         return

     # Match on the dotted suffix so that a name merely ending in the
     # extension's letters (e.g. "fooorg") is not mistaken for an article.
     suffix = '.' + source_files_extension
     try:
         fichiers_md = sorted(
             (f[:-len(suffix)] + '.gmi' for f in os.listdir(folder_path) if f.endswith(suffix)),
             reverse=True,
         )
     except OSError as e:
         print(f" ------------ build_indexes: Erreur lors de la lecture du dossier : {e}")
         return
     print('fichiers trouvés:', len(fichiers_md))
     return fichiers_md
 # transformer le nom du fichier orgmode en une création de dossier de l'année, et un sous dossier du nom du slug dans le nom de fichier, contenant un seul fichier d'index afin de faire de l'url rewrite en dur.
 # le nom de fichier org commence par une date YYYY-MM-DD ou bien YYYYMMDDHHmmss, est suivie d'un slug, et finit par l'extension .org
@ -179,7 +70,7 @@ def generer_index(dossier_source, fichier_index):
        link_org = fichier.replace('.gmi', '.org')
        file_path_org = os.path.join(dossier_parent, "sources", website_name, lang_folder, link_org)
-        article_name = trouver_nom_article(file_path_org, 'org')
+        article_name = trouver_nom_article(file_path_org,args.source, 'org')
        if not article_name:
            article_name = slug.replace('-', ' ')
@ -200,11 +91,11 @@ def generer_index(dossier_source, fichier_index):
    contenu_index_html += "<hr/>"
    contenu_index_html += "<h1>Navigation</h1>"
    for fichier in files_static:
-        # print(" -------- fichier ", fichier)
+        mylog(" -------- fichier ", fichier)
        link_html = fichier.replace('.gmi', '.html')
        link_org = fichier.replace('.gmi', '.org')
        file_path_org = os.path.join(dossier_parent, "sources", website_name, link_org)
-        article_name = trouver_nom_article(file_path_org, 'org')
+        article_name = trouver_nom_article(file_path_org,args.source, 'org')
        if article_name:
            contenu_index_gmi += f"=> {fichier} {article_name}\n"
@ -212,11 +103,11 @@ def generer_index(dossier_source, fichier_index):
            contenu_index_gmi += f"=> {fichier}\n"
        if fichier != "index.gmi":
-            # print(' -------- rechercher le nom de l article dans le fichier ')
+            mylog(' -------- rechercher le nom de l article dans le fichier ')
            if use_article_file_for_name:
                article_name = link_html
            else:
-                article_name = trouver_nom_article(file_path_org, 'org')
+                article_name = trouver_nom_article(file_path_org,args.source, 'org')
            if not article_name:
                article_name = link_html
--- a/converters.sh
+++ b/converters.sh
@ -146,7 +146,7 @@ convert_markdown_to_gmi() {
    # pages en
    for fichier in sources/$website_name/converted/*.md ; do
        if [ -f "$fichier" ]; then
-#            echo "----------- convert_markdown_to_gmi : md2gemini : $destination_gemini : $fichier"
+            echo "----------- convert_markdown_to_gmi : md2gemini : $destination_gemini : $fichier"
            md2gemini "$fichier" -w -d $destination_gemini/
        fi
    done
@ -154,7 +154,7 @@ convert_markdown_to_gmi() {
    for fichier in sources/$website_name/lang_fr/converted/*.md ; do
        if [ -f "$fichier" ]; then
-#            echo "----------- convert_markdown_to_gmi : md2gemini : $fichier"
+            echo "----------- convert_markdown_to_gmi : md2gemini : $fichier"
            md2gemini "$fichier" -w -d sources/$website_name/lang_fr/converted/
        fi
    done
@ -162,7 +162,7 @@ convert_markdown_to_gmi() {
    for fichier in sources/$website_name/lang_en/converted/*.md ; do
        if [ -f "$fichier" ]; then
-#            echo "----------- convert_markdown_to_gmi : md2gemini : $fichier"
+            echo "----------- convert_markdown_to_gmi : md2gemini : $fichier"
            md2gemini "$fichier" -w -d sources/$website_name/lang_fr/converted/
        fi
    done
@ -197,6 +197,9 @@ for website_name in "${blogs_folders[@]}"; do
  # sauver le tout dans un fichier index.gmi
  python3 build_indexes.py $website_name
  # créer les pages de tags
  python3 gather_tags_in_json.py $website_name
  # déplacer les fichiers générés en html dans le dossier statique
  mv sources/$website_name/converted/*.html html-websites/$website_name/
@ -205,4 +208,7 @@ for website_name in "${blogs_folders[@]}"; do
  # copier le style dans le dossier html
  cp $style_file html-websites/$website_name/style.css
  # traiter les réductions d'images dans l'inbox
  python3 pictures_resize.py
 done
--- a/deploy.sh
+++ b/deploy.sh
@ -0,0 +1,2 @@
 # copier les images vers le serveur qui héberge les blogs
 rsync -avzp output/pictures/* tykayn@proxmox.coussinet.org:/poule/encrypted/www/tykayn-inbox/blogs-output-pictures
--- a/enrich_html.py
+++ b/enrich_html.py
@ -22,7 +22,7 @@ static_page_path = f"{source_blog}/templates/html/static.html"
 enable_header=True
-# print('---------- blog name ', blog_name)
+mylog('---------- blog name ', blog_name)
 template_content = configs_sites[blog_name]
 after_article = ''
@ -51,7 +51,7 @@ def enrich_one_file(html_content: str, partials: dict = {"header_page": "", "foo
    if inline_the_css is True:
-        print(' ----------- enrich_html: include css inline in each html page')
+        mylog(' ----------- enrich_html: include css inline in each html page')
        with open(os.path.join(root_path, file), "r") as f:
            css_content = f.read()
            css_content = "<style type='text/css'>{css_content}</style>"
@ -144,6 +144,7 @@ def enrich_one_file(html_content: str, partials: dict = {"header_page": "", "foo
                        </div>
                        <nav class="footer-nav">
                            {template_content['NAVIGATION']}
                            <a href="/tags/">Tags</a>
                        </nav>
                    </div>
                </div>
@ -169,8 +170,9 @@ def ouvrir_fichier(chemin_fichier):
        raise FileNotFoundError(f"Le fichier {chemin_fichier} n'existe pas.")
 liste_fichiers_du_blog_convertis = os.walk(html_pages)
-print('fichiers à enrichir:', liste_fichiers_du_blog_convertis)
+mylog('fichiers à enrichir:', liste_fichiers_du_blog_convertis)
 count_articles=0
 # Parcourir tous les fichiers HTML dans le dossier du blog donné
 for root_path, dirs, files in liste_fichiers_du_blog_convertis :
@ -185,14 +187,14 @@ for root_path, dirs, files in liste_fichiers_du_blog_convertis :
    partials["footer_content"] = ouvrir_fichier(os.path.join('sources',blog_name, 'templates', 'footer_page.org'))
    for file in files:
-        print(file)
+        # mylog(file)
-        if file is "index.html":
+        # if file == "index.html":
-            template_content['no_header']
+        #     template_content['no_header']=True
        if file.endswith(".html"):
-            print(' ----------- enrich_html: file:', os.path.join(root_path, file))
+            # mylog(' ----------- enrich_html: file:'+ os.path.join(root_path, file))
-
+            count_articles+=1
-            print(' ----------- enrich_html: CSS inline: ', inline_the_css)
+            mylog(' ----------- enrich_html: CSS inline: ', inline_the_css)
            # Ouvrir le fichier HTML en mode lecture
            with open(os.path.join(root_path, file), "r") as f:
@ -200,8 +202,10 @@ for root_path, dirs, files in liste_fichiers_du_blog_convertis :
            html_content = enrich_one_file(html_content, partials)
            html_path_enriched = os.path.join(root_path, file)
-            print(' ----------- enrich_html: html_path_enriched ============> ', html_path_enriched)
+            mylog(' ----------- enrich_html: html_path_enriched ============> ', html_path_enriched)
            # Écrire le contenu modifié dans le fichier HTML
            with open(html_path_enriched, "w") as f:
                f.write(html_content)
-                print('\n ----------- enrich_html: html écrit ', html_path_enriched)
+                mylog('\n ----------- enrich_html: html écrit ', html_path_enriched)
 print('articles listés :',count_articles)
--- a/gather_tags_in_json.py
+++ b/gather_tags_in_json.py
@ -3,6 +3,9 @@ import json
 import os
 from collections import defaultdict
 from utils import *
 from website_config import *
 parser = argparse.ArgumentParser(description="Générer un site Web à partir de fichiers HTML.")
 parser.add_argument("blog_name", help="Le chemin vers le dossier contenant les fichiers HTML.")
@ -15,6 +18,7 @@ directory_fr = f'{directory_base}/lang_fr'  # Remplacez par le chemin de votre d
 output_file = f'sources/{blog_folder}/converted/tags.json'  # Fichier de sortie
 html_output_folder = f'html-websites/{blog_folder}/tags'  # Dossier de sortie pour les fichiers HTML
 excluded_tags = {'PROPERTIES', 'CREATED', 'ID', 'END'}
 automatic_tagging_enabled=True
 count_orgfiles = 0
@ -33,15 +37,47 @@ def find_org_files(directory):
    return org_files
 def add_tags_from_content(tags=None, file_content="", words_to_check=None):
     """
     Add to `tags` every candidate word that occurs in the given text.

     The search is a case-insensitive substring test; matching words are added
     with their original casing. A set passed as `tags` is updated in place.

     :param tags: set of tags to extend; a new set is created when None.
     :param file_content: text to search (str).
     :param words_to_check: words to look for; treated as empty when None.
     :return: the updated set of tags.
     """
     found_tags = set() if tags is None else tags
     candidates = [] if words_to_check is None else words_to_check
     haystack = file_content.lower()
     found_tags.update(term for term in candidates if term.lower() in haystack)
     return found_tags
 def extract_tags_from_file(file_path, excluded_tags, count_not_tagged_files=0):
    tags = set()
-    with open(file_path, 'r', encoding='utf-8') as file:
+    with open(file_path, 'r', encoding='utf-8') as file_content:
        tag_found = False
-        for line in file:
+        for line in file_content:
            if automatic_tagging_enabled:
                tags = add_tags_from_content(tags, line, global_config['auto_tag_terms'])
            # Check for orgmode tags :tag1:tag2:
            if ':' in line:
                for word in line.split():
-                    if word.startswith(':') and word.endswith(':'):
+                    if len(word) and word.startswith(':') and word.endswith(':'):
                        tag = word[1:-1]
                        if tag not in excluded_tags:
                            tags.add(tag)
@ -53,6 +89,8 @@ def extract_tags_from_file(file_path, excluded_tags, count_not_tagged_files=0):
                    if tag and tag not in excluded_tags:
                        tags.add(tag)
                        tag_found = True
    if not tag_found:
        count_not_tagged_files = count_not_tagged_files + 1
        print('no tag in the article', file_path)
@ -73,7 +111,7 @@ def save_to_json(tag_to_files, output_file):
        json.dump({tag: list(files) for tag, files in tag_to_files.items()}, json_file, ensure_ascii=False, indent=4)
-def generate_html_pages(tag_to_files, html_output_folder):
+def generate_html_pages_for_all_tags(tag_to_files, html_output_folder):
    if not os.path.exists(html_output_folder):
        os.makedirs(html_output_folder)
@ -84,15 +122,20 @@ def generate_html_pages(tag_to_files, html_output_folder):
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
-            <title>Fichiers .org pour le tag {tag}</title>
+            <title>Articles comportant le tag "{tag}"</title>
        </head>
        <body>
-            <h1>Fichiers .org pour le tag {tag}</h1>
+            <h1>Articles comportant le tag "{tag}"</h1>
            <ul>
        """
-        for file_path in files:
+        for file_path_org in files:
-            relative_path = os.path.relpath(file_path, start=directory_fr)
+            basename_file= os.path.basename(file_path_org)
-            html_content += f"                <li><a href='{relative_path}'>{relative_path}</a></li>\n"
+
            nom_article = trouver_nom_article( file_path_org, blog_folder, 'org')
            slug = trouver_slug_in_article(file_path_org)
            print('nom_article',nom_article)
            html_content += f"                <li><a href='/{slug}'>{nom_article}</a></li>\n"
        html_content += """
            </ul>
@ -108,6 +151,29 @@ def generate_html_pages(tag_to_files, html_output_folder):
    print(f"Pages HTML générées dans {html_output_folder}")
 def trouver_slug_in_article(file_path_org):
     """
     Find the value of the ``#+slug:`` keyword in an Org file.

     The keyword is matched case-insensitively at the start of a line, so
     ``#+SLUG:`` is recognised too. The first matching line wins.

     :param file_path_org: full path of the Org file (str).
     :return: the slug with surrounding whitespace stripped (str), or None
         when the keyword is absent or the file could not be read.
     """
     keyword = "#+slug:"
     slug = None
     try:
         with open(file_path_org, 'r', encoding='utf-8') as file:
             for line in file:
                 if line[:len(keyword)].lower() == keyword:
                     slug = line[len(keyword):].strip()
                     break
     except FileNotFoundError:
         print(f"Le fichier {file_path_org} n'a pas été trouvé.")
     except Exception as e:
         # Best effort: one unreadable article must not abort the caller's
         # processing, so the error is reported and None is returned.
         print(f"Une erreur s'est produite : {e}")
     return slug
 def generate_index_page(tag_to_files, html_output_folder):
    index_content = f"""
    <!DOCTYPE html>
@ -148,7 +214,7 @@ if __name__ == "__main__":
    tag_to_files = group_files_by_tags(org_files, excluded_tags, count_not_tagged_files)
    save_to_json(tag_to_files, output_file)
-    generate_html_pages(tag_to_files, html_output_folder)
+    generate_html_pages_for_all_tags(tag_to_files, html_output_folder)
    generate_index_page(tag_to_files, html_output_folder)
    print(f"Tags et fichiers associés ont été enregistrés dans {output_file}")
--- a/pictures_resize.py
+++ b/pictures_resize.py
@ -1,26 +1,53 @@
 # on redimensionne nos images qui illustreront les articles,
 # pour cela on les copie dans notre dossier pictures_inbox.
 # elles sont copiées en version réduite, et déplacées dans le dossier de l'année
 # ce script génère aussi de quoi copier des liens org affichant les thumbnails de l'image liée à la grande version
 import os
 from PIL import Image
 from datetime import datetime
 from PIL import Image, ExifTags
 # Variables
 INPUT_DIR = "pictures_inbox"
 DONE_DIR = "pictures_done"
 OUTPUT_DIR = "output/pictures"
 YEAR = datetime.now().strftime("%Y")
 SMALL_SUFFIX = "_small"
 IMAGE_FORMAT = "jpg"
 max_width_resized=600 # pixels max de largeur
 url_folder_pics="https://www.tykayn.fr/wp-uploads/content/i"
 # Créer le dossier de sortie s'il n'existe pas
 os.makedirs(os.path.join(OUTPUT_DIR, YEAR), exist_ok=True)
 def get_exif_orientation(image):
     """Get the orientation of the image from its EXIF metadata.

     :param image: object exposing a ``_getexif()`` method.
     :return: the value stored under the EXIF Orientation tag, or None when
         the image has no ``_getexif`` method, no EXIF data, or no
         Orientation entry.
     """
     orientation_tag = 0x0112  # standard numeric id of the EXIF "Orientation" tag
     try:
         exif = image._getexif()
     except AttributeError:
         # The image object does not expose _getexif at all.
         return None
     if not exif:
         return None
     # .get() rather than indexing: EXIF data without an Orientation entry
     # must yield None instead of raising KeyError.
     return exif.get(orientation_tag)
 def apply_orientation(image, orientation):
     """Rotate the image according to its orientation value.

     Orientations 3, 6 and 8 call ``image.rotate`` with 180, 270 and 90
     respectively, always with ``expand=True``. Any other value returns the
     image untouched.

     :param image: object exposing ``rotate(angle, expand=...)``.
     :param orientation: orientation value to apply.
     :return: the result of ``image.rotate(...)``, or `image` itself.
     """
     rotation_by_orientation = {3: 180, 6: 270, 8: 90}
     angle = rotation_by_orientation.get(orientation)
     if angle is None:
         return image
     return image.rotate(angle, expand=True)
 if len(os.listdir(INPUT_DIR)):
    print('traitement des images dans pictures inbox', len(os.listdir(INPUT_DIR)))
 # Parcourir toutes les images dans le dossier d'entrée
 for filename in os.listdir(INPUT_DIR):
    file_path = os.path.join(INPUT_DIR, filename)
    # Vérifier si c'est bien un fichier
-    if os.path.isfile(os.path.join(INPUT_DIR, filename)):
+    if os.path.isfile(file_path) and (file_path.lower().endswith('.jpg') or file_path.lower().endswith('.png')):
        # Récupérer le nom de base de l'image et son extension
        base_name = os.path.splitext(filename)
        extension = os.path.splitext(filename)[1].lower()
@ -30,21 +57,29 @@ for filename in os.listdir(INPUT_DIR):
        # Chemins des images
        input_image_path = os.path.join(INPUT_DIR, filename)
        done_image_path = os.path.join(DONE_DIR, filename)
        small_image_path = os.path.join(OUTPUT_DIR, YEAR, small_image_name)
        original_image_path = os.path.join(OUTPUT_DIR, YEAR, filename)
-        # Redimensionner l'image
+        # Ouvrir l'image et appliquer l'orientation
        with Image.open(input_image_path) as img:
-            img = img.resize((600, int(img.height * 600 / img.width)), Image.Resampling.LANCZOS)
+            orientation = get_exif_orientation(img)
            if orientation is not None:
                img = apply_orientation(img, orientation)
            # Redimensionner l'image
            img = img.resize((max_width_resized, int(img.height * max_width_resized / img.width)), Image.Resampling.LANCZOS)
            img.save(small_image_path, 'JPEG')  # Utiliser 'JPEG' au lieu de 'JPG'
        # Copier l'image originale dans le dossier de sortie
        with open(input_image_path, 'rb') as f:
            with open(original_image_path, 'wb') as f_out:
                f_out.write(f.read())
-
+        # déplacer l'image originale dans le dossier des images traitées
        os.rename(input_image_path, done_image_path)
        # Écrire la ligne pour le document .org
-        org_line = f"[[file:wp-uploads/content/i/{YEAR}/{small_image_name}][{filename}]]\n"
+
        org_line = f"\n\n[[file:{url_folder_pics}/{YEAR}/{small_image_name}][{filename}]]\n"
        with open(os.path.join("output", f"images_{YEAR}.org"), "a") as org_file:
            org_file.write(org_line)
		`@ -0,0 +1,2 @@`
							`# copier les images vers le serveur qui héberge les blogs`
							`rsync -avzp output/pictures/* tykayn@proxmox.coussinet.org:/poule/encrypted/www/tykayn-inbox/blogs-output-pictures`