add gather data

2024-11-20 00:24:09 +01:00 · 2024-11-20 00:24:09 +01:00 · 87d27dc8a2
commit 87d27dc8a2
parent baac2fd2f1
5 changed files with 131 additions and 66 deletions
--- a/gather_tags_in_json.py
+++ b/gather_tags_in_json.py
@ -18,7 +18,7 @@ output_file = f'sources/{blog_folder}/converted/tags.json'  # Fichier de sortie
 html_output_folder = f'html-websites/{blog_folder}/tag'  # Dossier de sortie pour les fichiers HTML
 html_output_folder_index = f'html-websites/{blog_folder}/tags'  # Dossier de sortie pour les fichiers HTML
 excluded_tags = {'PROPERTIES', 'CREATED', 'ID', 'END'}
-automatic_tagging_enabled = True
+automatic_tagging_enabled = global_config['automatic_tagging_enabled']

 count_not_tagged_files = 0
 count_orgfiles = 0
@ -38,62 +38,10 @@ def find_org_files(directory):
    return org_files


-def add_tags_from_content(tags=None, file_content="", words_to_check=None):
-    """
-    Ajoute des tags à l'ensemble `tags` si les mots correspondants sont trouvés dans le contenu du fichier.
-
-    :param tags: Ensemble de tags (set). Si None, un nouvel ensemble est créé (type set, optionnel).
-    :param file_content: Contenu du fichier (str).
-    :param words_to_check: Liste de mots à repérer (list). Si None, une liste vide est utilisée (type list, optionnel).
-    :return: Ensemble de tags mis à jour (set).
-    """
-    # Initialiser l'ensemble tags s'il est None
-    if tags is None:
-        tags = set()
-
-    # Initialiser la liste words_to_check s'il est None
-    if words_to_check is None:
-        words_to_check = []
-
-    # Convertir le contenu du fichier en minuscules pour une recherche insensible à la casse
-    file_content_lower = file_content.lower()
-
-    # Parcourir chaque mot à vérifier
-    for word in words_to_check:
-        # Vérifier si le mot est présent dans le contenu du fichier
-        if word.lower() in file_content_lower:
-            # Ajouter le tag correspondant à l'ensemble de tags
-            tags.add(word)
-
-    return tags


-def extract_tags_from_file(file_path, excluded_tags):
-    tags = set()
-    with open(file_path, 'r', encoding='utf-8') as file_content:
-        tag_found = False
-        for line in file_content:
-            if automatic_tagging_enabled:
-                tags = add_tags_from_content(tags, line, global_config['auto_tag_terms'])
-            # Check for orgmode tags :tag1:tag2:
-            if ':' in line:
-                for word in line.split():
-                    if len(word) and word.startswith(':') and word.endswith(':'):
-                        tag = word[1:-1]
-                        if tag not in excluded_tags:
-                            tags.add(tag)
-                            tag_found = True
-            # Check for #+tags: tag1,tag2
-            if line.startswith('#+tags:'):
-                for tag in line[len('#+tags:'):].split(','):
-                    tag = tag.strip()
-                    if tag and tag not in excluded_tags:
-                        tags.add(tag)
-                        tag_found = True

-    if not tag_found:
-        print('no tag in the article', file_path)
-    return tags
+


 def group_files_by_tags(org_files, excluded_tags):
--- a/testing.py
+++ b/testing.py
@ -4,6 +4,66 @@
 from utils import *
 from website_config import configs_sites

+# trouver les articles précédents et suivants
+import os
+import re
+
+def get_basename(file_name):
+    """Return `file_name` without the extension that os.path.splitext splits off."""
+    stem, _extension = os.path.splitext(file_name)
+    return stem
+
+# Folder holding the orgmode source files
+directory = 'sources/qzine_blog/lang_fr'
+
+# Article records, keyed by "year/slug"
+files_dict = {}
+
+# Build one record per .org file of the folder
+for file_name in os.listdir(directory):
+    if not file_name.endswith('.org'):
+        continue
+
+    file_path = os.path.join(directory, file_name)
+    with open(file_path, "r", encoding="utf-8") as org_file:
+        content = org_file.read()
+
+    basename = get_basename(file_name)
+    # NOTE(review): the call commented out further down in this script unpacked the
+    # same helper as (datestr, slug, path) — confirm the order of the returned tuple
+    date_str, annee, slug = find_year_and_slug_on_filename(basename)
+    slug_with_year = f"{annee}/{slug}"
+
+    files_dict[slug_with_year] = dict(
+        path=file_path,
+        basename=basename,
+        slug=slug,
+        slug_with_year=slug_with_year,
+        # first "__"-separated segment of the basename
+        date=basename.split('__')[0],
+        annee=annee,
+        tags=extract_tags_from_file(file_path, global_config['excluded_tags']),
+        title=find_first_level1_title(content),
+        next=None,
+        previous=None,
+    )
+
+# Sort the article keys in descending basename order. This assumes basenames
+# begin with the date stamp (the 'date' field is their first "__" segment), which
+# makes the order chronological; sorting the "year/slug" keys themselves would
+# order articles alphabetically by slug inside each year.
+sorted_basenames = sorted(
+    files_dict,
+    key=lambda article_key: files_dict[article_key]['basename'],
+    reverse=True,
+)
+
+# Link each article to its neighbours in that order: 'previous' is the entry
+# just before it in the sorted list, 'next' the entry just after it
+for i, basename in enumerate(sorted_basenames):
+    if i > 0:
+        files_dict[basename]['previous'] = sorted_basenames[i - 1]
+    if i < len(sorted_basenames) - 1:
+        files_dict[basename]['next'] = sorted_basenames[i + 1]
+
+# Dump every record for a visual check, one print call per article
+for basename, info in files_dict.items():
+    report_lines = (
+        f"Article: {basename}",
+        f"  Path: {info['path']}",
+        f"  tags: {info['tags']}",
+        f"  title: {info['title']}",
+        f"  Previous: {info['previous']}",
+        f"  Next: {info['next']}",
+        "-" * 40,
+    )
+    print(*report_lines, sep="\n")
+
+
 # parser = argparse.ArgumentParser(description="Générer un site Web à partir de fichiers HTML.")
 # parser.add_argument("blog_name", help="Le chemin vers le dossier contenant les fichiers HTML.")
 # parser.add_argument("--title", "-t", default="Mon site Web", help="Le titre du site Web.")
@ -32,6 +92,6 @@ from website_config import configs_sites
 # trouver le bon slug avec l'année

 # fichier = 'sources/tykayn_blog/lang_fr/20170708T095535_tkblog_918_7-techniques-geniales-pour-maximiser-son-malheur-au-quotidien.org'
-fichier = '20170708T095535__techniques-geniales-pour-maximiser-son-malheur-au-quotidien.org'
-datestr, slug, path = find_year_and_slug_on_filename(fichier)
-print(datestr, slug, path)
+# fichier = '20170708T095535__techniques-geniales-pour-maximiser-son-malheur-au-quotidien.org'
+# datestr, slug, path = find_year_and_slug_on_filename(fichier)
+# print(datestr, slug, path)
--- a/update_on_server.sh
+++ b/update_on_server.sh
@ -6,7 +6,7 @@ racine_sites_statiques="/poule/encrypted/www/"
 images_inbox="/poule/encrypted/www/tykayn-inbox/blogs-output-pictures"
 images_destination="/poule/encrypted/www/tykayn.fr/wp-content/uploads/i/"
 # on part du principe que ce dépot est cloné sur le serveur pour simplifier la mise à jour
-racine_depot_git="/poule/encrypted/www/orgmode-to-gemini-blog"
+racine_depot_git="/home/poule/encrypted/stockage-syncable/www/development/html/orgmode-to-gemini-blog"

 # on copie les fichiers générés dans les dépots servis pour chaque nom de domaine

@ -31,10 +31,10 @@ mv "$images_inbox/*" $images_destination
 #cp $racine_depot_git/html-websites/dragonfeu_blog/lang_fr/ $racine_sites_statiques/dragonfeu_blog -r
 #cp $racine_depot_git/html-websites/dragonfeu_blog/lang_en/ $racine_sites_statiques/dragonfeu_blog -r

-rsync -a /poule/encrypted/www/tykayn-inbox/cipherbliss_blog/* /poule/encrypted/www/cipherbliss.com/
-rsync -a /poule/encrypted/www/tykayn-inbox/tykayn_blog/* /poule/encrypted/www/tykayn.fr/
-rsync -a /poule/encrypted/www/tykayn-inbox/qzine_blog/* /poule/encrypted/www/qzine.fr/
-
-chown -R www-data:www-data /poule/encrypted/www/tykayn.fr
-chown -R www-data:www-data /poule/encrypted/www/cipherbliss.com
-chown -R www-data:www-data /poule/encrypted/www/qzine.fr
+#rsync -a /poule/encrypted/www/tykayn-inbox/cipherbliss_blog/* /poule/encrypted/www/cipherbliss.com/
+#rsync -a /poule/encrypted/www/tykayn-inbox/tykayn_blog/* /poule/encrypted/www/tykayn.fr/
+#rsync -a /poule/encrypted/www/tykayn-inbox/qzine_blog/* /poule/encrypted/www/qzine.fr/
+#
+#chown -R www-data:www-data /poule/encrypted/www/tykayn.fr
+#chown -R www-data:www-data /poule/encrypted/www/cipherbliss.com
+#chown -R www-data:www-data /poule/encrypted/www/qzine.fr
--- a/utils.py
+++ b/utils.py
@ -173,6 +173,61 @@ def extract_body_content(html_content):
        print('---- extract_body_content : no body found in this html')
        return html_content

+def add_tags_from_content(tags=None, file_content="", words_to_check=None):
+    """
+    Add every term of `words_to_check` found in `file_content` to `tags`.
+
+    Matching is case-insensitive and on whole words only, so a short term
+    such as "art" does not tag a text that merely contains "article".
+
+    :param tags: Set of tags, updated in place; a new set is created if None.
+    :param file_content: Text to scan (str).
+    :param words_to_check: Terms to look for (list); None means no terms.
+    :return: The updated set of tags; terms keep the spelling of the list.
+    """
+    import re  # local import: keeps the block independent of file imports
+
+    if tags is None:
+        tags = set()
+
+    # Lowercase both sides for a case-insensitive comparison
+    file_content_lower = file_content.lower()
+
+    for word in words_to_check or []:
+        if not word:
+            continue  # skip empty terms, they cannot name a tag
+        # Lookarounds rather than \b so terms edged by punctuation still match
+        if re.search(rf"(?<!\w){re.escape(word.lower())}(?!\w)", file_content_lower):
+            tags.add(word)
+
+    return tags
+
+def extract_tags_from_file(file_path, excluded_tags):
+    """
+    Collect the tags of one org file; prints a notice when it has no explicit tag.
+
+    :param file_path: Path of the file to scan, read line by line as UTF-8.
+    :param excluded_tags: Tag names to ignore (container supporting `in`).
+    :return: Set of explicit tags plus, when automatic tagging is enabled, matched terms.
+    """
+    tags = set()
+    explicit = set()  # tags written in the file, as opposed to automatic ones
+    with open(file_path, 'r', encoding='utf-8') as file_content:
+        for line in file_content:
+            if global_config['automatic_tagging_enabled']:
+                tags = add_tags_from_content(tags, line, global_config['auto_tag_terms'])
+            # orgmode tags :tag1:tag2: -> one tag per colon-separated segment
+            for word in line.split():
+                if word.startswith(':') and word.endswith(':'):
+                    explicit.update(word[1:-1].split(':'))
+            # "#+tags: tag1,tag2" keyword line
+            if line.startswith('#+tags:'):
+                explicit.update(tag.strip() for tag in line[len('#+tags:'):].split(','))
+    # Drop empty segments (from ":", "::" or ",,") and the excluded names
+    explicit = {tag for tag in explicit if tag and tag not in excluded_tags}
+    if not explicit:
+        print('no tag in the article', file_path)
+    return tags | explicit

 def remove_properties_section(text):
    pattern = r"<h1 id=\"article\">Article</h1>.+?</ul>"
--- a/website_config.py
+++ b/website_config.py
@ -4,11 +4,13 @@ global_config = {
    "slug_with_year": True,
    # "show_logs": False,
    "show_logs": True,
+    "automatic_tagging_enabled": True,
    "rebuild_files_filter": 2024,
    "source_files_extension": "org",
+    "excluded_tags": ['PROPERTIES', 'CREATED', 'ID', 'END','CUSTOM_ID'],
    # controlled vocabulary to find tags automatically
    "auto_tag_terms": ["illustration", "tuto", "nsfw", "bd", "récit", "science",
-                       "wtf", "yaoi", "yuri", "sondage", "entreprise", "ai", "photos",
+                       "wtf", "yaoi", "yuri", "sondage", "entreprise", "AI", "photos",
                       "cosplay", "festival", "fanzine", "manif", "logiciel", "inktober",
                       "kotlife", "féminisme", "fantasme", "art", "sociologie", "couple", "masturbation",
                       "boobs", "sortirDesFossiles", "électrique", "maison", "GTD", "chat", "PIM","mastoart",