# orgmode-to-gemini-blog/utils.py

#!/bin/python3
import re

# Location of the org-roam directory on the author's machine.
# This value is machine-specific and should be customized for your own setup.
org_roam_dir: str = '/home/tykayn/Nextcloud/textes/orgmode/org-roam/'

# Regex capturing the identifier that follows an ``:ID:`` marker: optional
# whitespace, then a run of ASCII letters, digits and hyphens (group 1).
pattern_roam_id_search = r':ID:(?:\s+)?([a-zA-Z0-9-]+)'


def get_id_of_roam_note_content(content):
    """Return the identifier following the first ``:ID:`` marker in *content*.

    Args:
        content: Text to scan.

    Returns:
        The captured identifier string, or ``None`` when no ``:ID:`` marker
        followed by an identifier is present.
    """
    found = re.search(pattern_roam_id_search, content)
    return found.group(1) if found else None


def find_first_level1_title(content):
    """Return the title of the first top-level heading found in *content*.

    The first line of the form ``* <title>`` is used.  When that title is the
    placeholder ``Article``, the first ``** <title>`` line is returned
    instead.

    Surrounding whitespace is stripped from the captured heading before it is
    compared or returned.  Without this, trailing spaces or the ``\\r`` left by
    CRLF line endings would make the ``Article`` placeholder check fail and
    leak into the returned title.

    Args:
        content: Org-mode text to scan.

    Returns:
        The heading text, or ``None`` when there is no level-1 heading, or
        when the level-1 heading is ``Article`` and no level-2 heading exists.
    """
    top_heading = re.search(r'^\* (.+)$', content, re.MULTILINE)
    if top_heading is None:
        return None

    # ``.`` matches ``\r``, so the capture may carry CR / trailing blanks.
    title = top_heading.group(1).strip()
    if title != 'Article':
        return title

    # Placeholder heading: fall back to the first level-2 heading.
    sub_heading = re.search(r'^\*\* (.+)$', content, re.MULTILINE)
    if sub_heading is None:
        return None
    return sub_heading.group(1).strip()


def extract_body_content(html_content):
    """Return the markup found between ``<body ...>`` and ``</body>``.

    HTML tag names are case-insensitive, so the tags are matched regardless
    of case (``<BODY>`` is handled like ``<body>``).  Only the first body
    element is considered, and the match stops at the first closing tag.

    Args:
        html_content: HTML text to search.

    Returns:
        The inner content of the body element.  When no body element is
        found, a diagnostic line is printed and *html_content* is returned
        unchanged.
    """
    body = re.search(
        r'<body[^>]*?>(.*?)</body>',
        html_content,
        re.DOTALL | re.IGNORECASE,  # DOTALL: the body spans multiple lines
    )
    if body is None:
        print('---- extract_body_content : no body found in this html')
        return html_content
    return body.group(1)


def remove_properties_section(text):
    """Delete each ``Article`` heading block from *text*.

    A block starts at ``<h1 id="article">Article</h1>`` and runs, across
    line breaks, up to and including the nearest following ``</ul>``.

    Args:
        text: HTML text to clean.

    Returns:
        *text* with every such block removed; unchanged when none is found.
    """
    article_block = re.compile(
        r"<h1 id=\"article\">Article</h1>.+?</ul>",
        re.DOTALL,
    )
    return article_block.sub("", text)


def remove_article_head_properties_orgmode(text):
    """Delete every ``:PROPERTIES:`` ... ``:END:`` span from *text*.

    Each span is matched non-greedily and may cover several lines; at least
    one character must sit between the two markers for a span to match.

    Args:
        text: Org-mode text to clean.

    Returns:
        *text* with all matching spans removed.
    """
    return re.sub(r":PROPERTIES:.+?:END:", "", text, flags=re.DOTALL)


def remove_hint_html(text):
    """Delete every ``<p>ceci<sub>estduhtml</sub></p>`` paragraph from *text*.

    Args:
        text: HTML text to clean.

    Returns:
        *text* with all occurrences of that exact paragraph removed.
    """
    hint_markup = re.compile(r"<p>ceci<sub>estduhtml</sub></p>", flags=re.DOTALL)
    cleaned = hint_markup.sub("", text)
    return cleaned