orgmode-to-gemini-blog/utils.py

#!/bin/python3
import os
import re
import shutil
from datetime import datetime
import unicodedata

from website_config import *

# Root directory of the org-roam notes; this path should be customized per machine.
org_roam_dir: str = '/home/tykayn/Nextcloud/textes/orgmode/org-roam/'

# Finds the org-roam identifier: captures the letters, digits and dashes
# that follow ":ID:" (optionally separated by whitespace).
pattern_roam_id_search = r':ID:(?:\s+)?([a-zA-Z0-9-]+)'
# Splits a ".gmi" file name into a 14-digit timestamp (group 1) and a slug
# that starts with "-" (group 2, leading dash included).
regex = r"^(\d{14})(-[a-zA-Z0-9_-]+)\.gmi$"
# Same split for file names where "_" separates the 14-digit timestamp
# (group 1) from the slug (group 2, separator excluded).
regex_orgroam = r"^(\d{14})_([a-zA-Z0-9_-]+)\.gmi$"

# Logging switch read by mylog(). global_config is presumably provided by the
# star import from website_config -- verify there.
# show_logs=True
show_logs = global_config["show_logs"]


def mylog(*content):
    """Print every positional argument when logging is enabled.

    Nothing is printed unless the module-level ``show_logs`` flag is truthy.

    Args:
        *content: Values to print, separated by single spaces like ``print``.
    """
    if show_logs:
        # Unpack the arguments: printing the tuple itself would emit its repr
        # (parentheses, quotes, commas) instead of the values.
        print(*content)


def trouver_nom_article(fichier_org, blog_name, format="html"):
    """Return the article title found in a file, cleaned for display.

    The file is scanned line by line. The first level-1 title whose text is
    not a generic section name ("article" or "liens", case-insensitive) is
    used. Once a generic level-1 title has been seen, the first level-2 title
    found on a later line is used instead.

    Args:
        fichier_org: Path of the file to read.
        blog_name: Blog name; the prefix ``blog_name + '_'`` is removed from
            the title.
        format: ``'html'`` to look for ``<h1>``/``<h2>`` tags; any other
            value selects org-mode star headings.

    Returns:
        The title with remaining underscores replaced by spaces, or an empty
        string when no title was found.
    """
    mylog('fichier_org, ', fichier_org)
    # Explicit UTF-8 so accented titles do not depend on the system locale.
    with open(fichier_org, 'r', encoding='utf-8') as file:
        lignes = file.readlines()

    nom_article = ''

    mylog('trouver_nom_article format', format)
    # Regular expressions matching level-1 and level-2 titles.
    if format == 'html':
        titre_niveau_1 = r'<h1\s+(?:id|data-created)="[^"]*">(.*?)</h1>'
        titre_niveau_2 = r'^\<h2.*?\>(.+)\<\/h2\>$'
    else:
        titre_niveau_1 = r'^\*+ (.+)$'
        titre_niveau_2 = r'^\*\*+ (.+)$'

    titres_generiques = ('article', 'liens')
    titre_generique_vu = False

    for ligne in lignes:
        titre_niveau_1_match = re.match(titre_niveau_1, ligne)
        if titre_niveau_1_match:
            titre_niveau_1_texte = titre_niveau_1_match.group(1)
            if titre_niveau_1_texte.lower() not in titres_generiques:
                nom_article = titre_niveau_1_texte
                break
            # Generic section title: remember it and look for a level-2
            # title on the following lines (the same line cannot hold both).
            titre_generique_vu = True
            continue

        if titre_generique_vu:
            titre_niveau_2_match = re.match(titre_niveau_2, ligne)
            if titre_niveau_2_match:
                nom_article = titre_niveau_2_match.group(1)
                break
    mylog(f"Nom de l'article : {nom_article}")

    return nom_article.replace(blog_name + '_', '').replace('_', ' ')


def find_year_and_slug(fichier):
    """Derive the date string, year and slug from an article file name.

    Two file-name shapes are recognised through the module-level patterns
    ``regex_orgroam`` (timestamp, underscore, slug) and ``regex`` (timestamp
    followed by a dash-prefixed slug). When neither matches, hard-coded 2024
    placeholders are returned together with the file name stripped of its
    ``.gmi``/``.org`` extension.

    Args:
        fichier: File name to analyse; any ``..`` is first collapsed to ``.``.

    Returns:
        A list ``[date_str, annee, slug]`` of strings.

    Raises:
        ValueError: From ``datetime.strptime`` when the captured timestamp is
            not a valid date.
    """
    fichier = fichier.replace('..', '.')
    mylog(f" ------------ build_indexes: find in {fichier} -------------")

    # Placeholder values used when the file name carries no timestamp.
    date = '2024-00-00'
    date_str = '2024-00-00'
    annee = '2024'
    slug = fichier.replace('.gmi', '').replace('.org', '')

    orgroam_match = re.match(regex_orgroam, fichier)
    if orgroam_match:
        date_str, slug = orgroam_match.group(1, 2)
        annee = date_str[:4]

    dashed_match = re.match(regex, fichier)
    if dashed_match:
        date_str = dashed_match.group(1)
        # Turn the captured date into a datetime object.
        date_format = "%Y-%m-%d" if "-" in date_str else "%Y%m%d%H%M%S"
        date = datetime.strptime(date_str, date_format)
        printable_date = str(date).replace(' 00:00:00', '')
        # NOTE(review): the formatted date ("YYYY-MM-DD HH:MM:SS") does not
        # look like it can occur in a slug built from a 14-digit prefix, so
        # this replace is probably a no-op -- confirm the intended slug.
        slug = enlever_premier_tiret_ou_underscore(slug.replace(printable_date, ''))
        annee = str(date.year)

    log_lines = (
        f" ------------ build_indexes:  ",
        f" ------------ build_indexes: Fichier: {fichier}",
        f" ------------ build_indexes: année: {annee}",
        f" ------------ build_indexes: str(date): {str(date)}",
        f" ------------ build_indexes: slug: {slug}",
        f" ------------ build_indexes: chemin: {annee}/{slug}/",
    )
    for log_line in log_lines:
        mylog(log_line)
    return [date_str, annee, slug]


def enlever_premier_tiret_ou_underscore(chaîne):
    """Drop one leading ``-`` or ``_`` from the string, if present.

    Only the first character is removed; any other string is returned as is.
    """
    return chaîne[1:] if chaîne.startswith(('-', '_')) else chaîne


def create_path_folders_and_move_file(path, file):
    """Move ``file`` to ``path``, creating missing parent folders first.

    Args:
        path: Destination path handed to ``shutil.move``.
        file: Path of the file to move.
    """
    parent_dir = os.path.dirname(path)
    # A bare file name has no directory part, and os.makedirs('') raises
    # FileNotFoundError, so only create folders when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    shutil.move(file, path)


def get_files_list_of_folder(folder_path):
    """List a folder's source articles under their ``.gmi`` names.

    Entries whose name ends with ``global_config['source_files_extension']``
    are kept, with ``'.' + extension`` replaced by ``.gmi``. The result is
    sorted by name in descending order.

    Args:
        folder_path: Folder to list.

    Returns:
        The sorted list of names, or ``None`` when the folder does not exist
        or cannot be read (a message is printed in both cases).
    """
    # Bail out early when the folder is missing.
    if not os.path.exists(folder_path):
        print(f" ------------ build_indexes: Erreur : Le dossier '{folder_path}' n'existe pas.")
        return
    mylog('----------- get_files_list_of_folder: folder_path : ', folder_path)
    try:
        converted_names = []
        for entry in os.listdir(folder_path):
            if not entry.endswith(global_config['source_files_extension']):
                continue
            converted_names.append(
                entry.replace('.' + global_config['source_files_extension'], '.gmi')
            )
        # Descending name order.
        converted_names.sort(reverse=True)
        print('fichiers trouvés:', len(converted_names))
        return converted_names
    except OSError as error:
        print(f" ------------ build_indexes: Erreur lors de la lecture du dossier : {error}")
        return


def get_id_of_roam_note_content(content):
    """Return the first org-roam ID found in ``content``, or ``None``.

    The search uses the module-level ``pattern_roam_id_search`` pattern and
    returns its first capture group.
    """
    found = re.search(pattern_roam_id_search, content)
    return found.group(1) if found else None


def find_first_level1_title(content):
    """Return the first meaningful org-mode title of ``content``.

    The first ``* `` heading is returned unless its text is exactly
    ``Article``; in that case the first ``** `` heading anywhere in the
    content is returned instead.

    Returns:
        The heading text, or ``None`` when no suitable heading exists.
    """
    top_heading = re.search(r'^\* (.+)$', content, re.MULTILINE)
    if top_heading is None:
        return None

    top_text = top_heading.group(1)
    if top_text != 'Article':
        return top_text

    # Generic "Article" heading: fall back to the first level-2 heading.
    sub_heading = re.search(r'^\*\* (.+)$', content, re.MULTILINE)
    return sub_heading.group(1) if sub_heading else None

def find_extract_in_content_org(org_content):
    """Strip org-mode metadata from ``org_content`` and tidy the whitespace.

    In order: remove ``#+`` keyword lines, ``** Logbook`` sections (up to the
    next ``** `` or the end of the text), ``:PROPERTIES:`` ... ``:END:``
    drawers, and runs of blank lines. Every line is then trimmed, as is the
    final result.

    Returns:
        The cleaned text.
    """
    multiline_dotall = re.DOTALL | re.MULTILINE
    cleanup_steps = (
        # "#+" keyword lines
        (r'^\s*#\+.*\n', '', re.MULTILINE),
        # logbook sections
        (r'^\*\* Logbook\n.*?(?=\*\* |\Z)', '', multiline_dotall),
        # property drawers
        (r'^:PROPERTIES:\n.*?:END:\n', '', multiline_dotall),
        # extra blank lines
        (r'\n\s*\n+', '\n', 0),
    )

    cleaned = org_content
    for pattern, replacement, flags in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    # Trim each line, then the whole text.
    trimmed_lines = [line.strip() for line in cleaned.splitlines()]
    return '\n'.join(trimmed_lines).strip()

def extract_body_content(html_content):
    """Return what lies between the first ``<body ...>`` and ``</body>``.

    When no body element is found, a message is printed and the input is
    returned unchanged.
    """
    body = re.search(r'<body.*?>(.*?)</body>', html_content, re.DOTALL)
    if body is None:
        print('---- extract_body_content : no body found in this html')
        return html_content
    return body.group(1)


def remove_properties_section(text):
    """Remove the generated "Article" heading and what follows up to ``</ul>``.

    Each span starting at ``<h1 id="article">Article</h1>`` and ending at the
    nearest following ``</ul>`` is deleted; newlines inside it are allowed.
    """
    section = re.compile(r"<h1 id=\"article\">Article</h1>.+?</ul>", re.DOTALL)
    return section.sub("", text)


def remove_article_head_properties_orgmode(text):
    """Remove every ``:PROPERTIES:`` ... ``:END:`` span from org-mode text.

    Each span runs to the nearest ``:END:`` and may cover several lines; at
    least one character must sit between the two markers.
    """
    drawer = re.compile(r":PROPERTIES:.+?:END:", re.DOTALL)
    return drawer.sub("", text)


def remove_hint_html(text):
    """Remove every ``<p>ceci<sub>estduhtml</sub></p>`` marker from ``text``."""
    hint = re.compile(r"<p>ceci<sub>estduhtml</sub></p>", re.DOTALL)
    return hint.sub("", text)


def slugify_title(title_text):
    """
    Turn a title into a slug.

    Accents are dropped through NFKD normalisation and ASCII encoding, the
    text is lower-cased, characters other than ``a-z``, ``0-9``, whitespace
    and ``-`` are removed, whitespace runs become ``-``, repeated dashes are
    collapsed and dashes at both ends are trimmed.

    :param title_text: Title text (str).
    :return: Lower-case, dash-separated slug (str).
    """
    ascii_only = (
        unicodedata.normalize('NFKD', title_text)
        .encode('ascii', 'ignore')
        .decode('ascii')
    )
    slug = ascii_only.lower()
    substitutions = (
        (r'[^a-z0-9\s-]', ''),
        (r'\s+', '-'),
        (r'-+', '-'),
    )
    for pattern, replacement in substitutions:
        slug = re.sub(pattern, replacement, slug)
    return slug.strip('-')

def find_slug_in_file_basename(file_basename) -> "str | None":
    """
    Build a ``year/slug`` string from an org file basename.

    The basename must start with 14 digits, the first four being the year,
    and end with ``.org``. Of the text in between, only the part after the
    last underscore is kept, and one leading ``-`` or ``_`` is removed.

    :param file_basename: File name (str).
    :return: ``"<year>/<slug>"`` (str), or None when the name does not match.
    """
    match = re.match(r'^(\d{4})\d{10}(.+)\.org$', file_basename)
    if not match:
        return None

    year, slug = match.groups()
    # Keep the final part of the file name; without any underscore the split
    # yields the whole text, so no length check is needed.
    slug = slug.split('_')[-1]
    slug = enlever_premier_tiret_ou_underscore(slug)

    return f"{year}/{slug}"
refacto 2024-11-14 13:32:56 +01:00			`#!/bin/python3`
up utils and global conf 2024-11-15 15:56:11 +01:00			`import os`
refacto 2024-11-14 13:32:56 +01:00			`import re`
up utils and global conf 2024-11-15 15:56:11 +01:00			`import shutil`
			`from datetime import datetime`
unify slugs 2024-11-15 23:55:20 +01:00			`import unicodedata`
up utils and global conf 2024-11-15 15:56:11 +01:00
			`from website_config import *`
refacto 2024-11-14 13:32:56 +01:00
			`# this path should be customized`
			`org_roam_dir: str = '/home/tykayn/Nextcloud/textes/orgmode/org-roam/'`

up utils and global conf 2024-11-15 15:56:11 +01:00			`# Trouver l'identifiant OrgROAM`
refacto 2024-11-14 13:32:56 +01:00			`pattern_roam_id_search = r':ID:(?:\s+)?([a-zA-Z0-9-]+)'`
up utils and global conf 2024-11-15 15:56:11 +01:00			`# Expression régulière pour extraire la date et le slug du nom de fichier org`
			`regex = r"^(\d{14})(-[a-zA-Z0-9_-]+)\.gmi$"`
			`# Recherche de date de création du fichier org-roam dans un article gemini`
			`regex_orgroam = r"^(\d{14})_([a-zA-Z0-9_-]+)\.gmi$"`

			`# show_logs=True`
			`show_logs = global_config["show_logs"]`


			`def mylog(*content):`
			`"""Fonction qui imprime tous les arguments passés selon le niveau de debug souhaité."""`
			`if show_logs:`
			`print(content)`


			`def trouver_nom_article(fichier_org, blog_name, format="html"):`
			`mylog('fichier_org, ', fichier_org)`
			`with open(fichier_org, 'r') as file:`
			`lignes = file.readlines()`

			`nom_article = ''`

			`mylog('trouver_nom_article format', format)`
			`# Expressions régulières pour trouver les titres de niveau 1 et 2`
			`if format == 'html':`
			`titre_niveau_1 = r'<h1\s+(?:id\|data-created)="[^"]">(.?)</h1>'`
			`titre_niveau_2 = r'^\<h2.*?\>(.+)\<\/h2\>$'`
			`else:`
			`titre_niveau_1 = r'^\*+ (.+)$'`
			`titre_niveau_2 = r'^\\+ (.+)$'`

			`# Itérer sur les lignes du fichier`
			`for ligne in lignes:`
			`# Rechercher un titre de niveau 1`
			`titre_niveau_1_match = re.match(titre_niveau_1, ligne)`
			`if titre_niveau_1_match:`
			`titre_niveau_1_texte = titre_niveau_1_match.group(1)`
			`if titre_niveau_1_texte.lower() != "article" and titre_niveau_1_texte.lower() != "liens":`
			`nom_article = titre_niveau_1_texte`
			`break`
			`else:`
			`# Si le premier titre de niveau 1 est "Article", rechercher le premier titre de niveau 2`
			`titre_niveau_2_match = re.match(titre_niveau_2, ligne)`
			`if titre_niveau_2_match:`
			`nom_article = titre_niveau_2_match.group(1)`
			`break`
			`mylog(f"Nom de l'article : {nom_article}")`

			`return nom_article.replace(blog_name + '_', '').replace('_', ' ')`


			`def find_year_and_slug(fichier):`
			`fichier = fichier.replace('..', '.')`
			`mylog(f" ------------ build_indexes: find in {fichier} -------------")`
up atom feed generate 2024-11-18 11:18:50 +01:00			`slug = fichier.replace('.gmi', '').replace('.org', '')`
up utils and global conf 2024-11-15 15:56:11 +01:00			`annee = '2024'`
			`date_str = '2024-00-00'`
			`date = '2024-00-00'`
			`match = re.match(regex_orgroam, fichier)`

			`if match:`
			`date_str = match.group(1)`
			`annee = date_str[:4]`
			`slug = match.group(2)`

			`match = re.match(regex, fichier)`
			`if match:`
			`date_str = match.group(1)`
			`# Convertir la date en objet datetime`
			`if "-" in date_str:`
			`date = datetime.strptime(date_str, "%Y-%m-%d")`
			`else:`
			`date = datetime.strptime(date_str, "%Y%m%d%H%M%S")`
			`date_string_replaced = str(date).replace(' 00:00:00', '')`
			`slug = slug.replace(date_string_replaced, '')`
			`slug = enlever_premier_tiret_ou_underscore(slug)`

			`annee = str(date.year).replace(' 00:00:00', '')`
			`# else:`
			`# print('ERREUR aucun slug trouvé')`

			`mylog(f" ------------ build_indexes: ")`
			`mylog(f" ------------ build_indexes: Fichier: {fichier}")`
			`mylog(f" ------------ build_indexes: année: {annee}")`
			`mylog(f" ------------ build_indexes: str(date): {str(date)}")`
			`mylog(f" ------------ build_indexes: slug: {slug}")`
			`mylog(f" ------------ build_indexes: chemin: {annee}/{slug}/")`
			`return [date_str, annee, slug]`


			`def enlever_premier_tiret_ou_underscore(chaîne):`
			`if chaîne.startswith('-') or chaîne.startswith('_'):`
			`chaîne = chaîne[1:]`
			`return chaîne`


			`# création des dossiers intermédiaires s'il y en a`
			`# déplace le fichier dans le dossier spécifié`
			`def create_path_folders_and_move_file(path, file):`
			`os.makedirs(os.path.dirname(path), exist_ok=True)`

			`shutil.move(file, path)`


			`def get_files_list_of_folder(folder_path):`
			`# Vérifie si le dossier existe`
			`if not os.path.exists(folder_path):`
			`print(f" ------------ build_indexes: Erreur : Le dossier '{folder_path}' n'existe pas.")`
			`return`
			`mylog('----------- get_files_list_of_folder: folder_path : ', folder_path)`
			`# Liste les fichiers articles, trie par nom décroissant`
			`try:`
			`fichiers_md = sorted(`
up detection of slug 2024-11-15 16:24:31 +01:00			`[f.replace('.' + global_config['source_files_extension'], '.gmi') for f in os.listdir(folder_path) if`
			`f.endswith(global_config['source_files_extension'])], reverse=True)`
up utils and global conf 2024-11-15 15:56:11 +01:00			`print('fichiers trouvés:', len(fichiers_md))`
			`return fichiers_md`
			`except OSError as e:`
			`print(f" ------------ build_indexes: Erreur lors de la lecture du dossier : {e}")`
			`return`
refacto 2024-11-14 13:32:56 +01:00
up style 2024-11-14 16:22:34 +01:00
refacto 2024-11-14 13:32:56 +01:00			`def get_id_of_roam_note_content(content):`
			`match = re.search(pattern_roam_id_search, content)`
			`if match:`
			`return match.group(1)`
			`return None`

up style 2024-11-14 16:22:34 +01:00
refacto 2024-11-14 13:32:56 +01:00			`def find_first_level1_title(content):`
			`pattern = r'^\* (.+)$'`
			`match = re.search(pattern, content, re.MULTILINE)`
			`if match:`
			`if match.group(1) != 'Article':`
			`return match.group(1)`
			`else:`
			`pattern = r'^\\ (.+)$'`
			`match = re.search(pattern, content, re.MULTILINE)`
			`if match:`
			`return match.group(1)`
			`return None`

up atom feed generate 2024-11-18 11:18:50 +01:00			`def find_extract_in_content_org(org_content):`
			`# Supprimer les lignes qui commencent par #+`
			`org_content = re.sub(r'^\s#\+.\n', '', org_content, flags=re.MULTILINE)`

			`# Supprimer les sections de logbook`
			`org_content = re.sub(r'^\\ Logbook\n.?(?=\\* \|\Z)', '', org_content, flags=re.DOTALL \| re.MULTILINE)`

			`# Supprimer les propriétés`
			`org_content = re.sub(r'^:PROPERTIES:\n.*?:END:\n', '', org_content, flags=re.DOTALL \| re.MULTILINE)`

			`# Supprimer les lignes vides supplémentaires`
			`org_content = re.sub(r'\n\s*\n+', '\n', org_content)`

			`# Supprimer les espaces en début et fin de chaque ligne`
			`org_content = '\n'.join(line.strip() for line in org_content.splitlines())`

			`# Supprimer les espaces en début et fin du contenu final`
			`return org_content.strip()`
up style 2024-11-14 16:22:34 +01:00
			`def extract_body_content(html_content):`
up utils and global conf 2024-11-15 15:56:11 +01:00			`pattern = r'<body.?>(.?)</body>'`
up style 2024-11-14 16:22:34 +01:00			`match = re.search(pattern, html_content, re.DOTALL)`
			`if match:`
			`return match.group(1)`
			`else:`
up tkblog source 2024-11-15 01:45:11 +01:00			`print('---- extract_body_content : no body found in this html')`
			`return html_content`
up style 2024-11-14 16:22:34 +01:00

			`def remove_properties_section(text):`
			`pattern = r"<h1 id=\"article\">Article</h1>.+?</ul>"`
			`replacement = ""`
			`return re.sub(pattern, replacement, text, flags=re.DOTALL)`


			`def remove_article_head_properties_orgmode(text):`
			`pattern = r":PROPERTIES:.+?:END:"`
			`replacement = ""`
			`return re.sub(pattern, replacement, text, flags=re.DOTALL)`


			`def remove_hint_html(text):`
			`pattern = r"<p>ceci<sub>estduhtml</sub></p>"`
			`replacement = ""`
			`return re.sub(pattern, replacement, text, flags=re.DOTALL)`
up detection of slug 2024-11-15 16:24:31 +01:00

unify slugs 2024-11-15 23:55:20 +01:00			`def slugify_title(title_text):`
			`"""`
			`Transforme un titre en un slug valide.`

			`:param title_text: Titre en texte (str).`
			`:return: Slug en minuscules avec des tirets (str).`
			`"""`
			`title_text = unicodedata.normalize('NFKD', title_text).encode('ascii', 'ignore').decode('ascii')`
			`title_text = title_text.lower()`
			`title_text = re.sub(r'[^a-z0-9\s-]', '', title_text)`
			`title_text = re.sub(r'\s+', '-', title_text)`
			`title_text = re.sub(r'-+', '-', title_text)`
			`title_text = title_text.strip('-')`
			`return title_text`

up atom feed generate 2024-11-18 11:18:50 +01:00			`def find_slug_in_file_basename(file_basename) -> str:`
up detection of slug 2024-11-15 16:24:31 +01:00			`"""`
			`Extrait l'année et le slug du nom de fichier selon le format spécifié.`

			`:param file_basename: Nom de fichier (str).`
			`:return: Tuple contenant l'année et le slug (année, slug) ou None si non trouvé.`
			`"""`
unify slugs 2024-11-15 23:55:20 +01:00			`pattern = r'^(\d{4})\d{10}(.+)\.org$'`
up detection of slug 2024-11-15 16:24:31 +01:00			`match = re.match(pattern, file_basename)`
			`if match:`
			`year = match.group(1)`
			`slug = match.group(2)`
unify slugs 2024-11-15 23:55:20 +01:00			`# prendre la partie finale du nom du fichier`
			`splitted = slug.split('_')`
			`# print('len(splitted)', len(splitted), splitted)`
			`if len(splitted) > 1:`
			`slug = splitted[len(splitted)-1]`

			`# final_slug=slug.replace("_cipherbliss_blog_","")`
			`# final_slug=final_slug.replace("_blog_cil_gometz_","")`
			`slug=enlever_premier_tiret_ou_underscore(slug)`
up detection of slug 2024-11-15 16:24:31 +01:00
unify slugs 2024-11-15 23:55:20 +01:00			`slug = f"{year}/{slug}"`



			`return slug`
up detection of slug 2024-11-15 16:24:31 +01:00			`return None`
unify slugs 2024-11-15 23:55:20 +01:00