171 lines
6.6 KiB
Python
171 lines
6.6 KiB
Python
|
import sys
|
|||
|
import os
|
|||
|
sys.path.append("../md2gemini")
|
|||
|
import shutil
|
|||
|
from md2gemini import md2gemini
|
|||
|
from pathlib import Path
|
|||
|
import html
|
|||
|
import unicodedata
|
|||
|
import urllib
|
|||
|
|
|||
|
|
|||
|
mdsource = "/home/ploum/gandi_backup_202210119/blog_20221019"
|
|||
|
filesource = "/home/ploum/gandi_backup_202210119/vhosts/ploum.net/htdocs"
|
|||
|
filedest = "/home/ploum/dev/gemlog"
|
|||
|
|
|||
|
def convertlink(inlink):
|
|||
|
localfolder = ""
|
|||
|
writtenlink = ""
|
|||
|
if inlink.startswith("http://ploum.net") or inlink.startswith("https://ploum.net"):
|
|||
|
inlink = inlink.removeprefix("http://ploum.net").removeprefix("https://ploum.net")
|
|||
|
if not inlink.startswith("http"):
|
|||
|
if inlink.startswith("/post/"):
|
|||
|
inlink = inlink.removeprefix("/post")
|
|||
|
elif inlink.startswith("/public/"):
|
|||
|
inlink = "/wp-content/uploads/" + inlink.removeprefix("/public/")
|
|||
|
elif inlink.startswith("/images/"):
|
|||
|
inlink = "/wp-content/uploads/" + inlink.removeprefix("/images/")
|
|||
|
elif inlink.startswith("../uploads"):
|
|||
|
inlink = "/wp-content/" + inlink.removeprefix("../")
|
|||
|
if inlink.startswith("wp-content/") \
|
|||
|
or inlink.startswith("/wp-content/"):
|
|||
|
inlink = inlink.removeprefix("/")
|
|||
|
inlink = urllib.parse.unquote(inlink)
|
|||
|
p = Path(filesource + "/" + inlink)
|
|||
|
inlink = "../files/old/" + inlink.removeprefix("wp-content/uploads/")
|
|||
|
if p.exists():
|
|||
|
dest = Path(filedest + inlink[2:])
|
|||
|
print("copying %s file to %s"%(p,dest))
|
|||
|
if not dest.exists():
|
|||
|
if not dest.parent.exists():
|
|||
|
os.makedirs(dest.parent)
|
|||
|
shutil.move(p,dest)
|
|||
|
|
|||
|
elif "ploum" in inlink or "plus.google.com" in inlink:
|
|||
|
if "medium.com" in inlink or "facebook.com" in inlink or "twitter.com" in inlink or\
|
|||
|
"patreon.com" in inlink or "last.fm" in inlink or "tipeee.com" in inlink or\
|
|||
|
"klout.com" in inlink or "flattr.com" in inlink or "app.net" in inlink or\
|
|||
|
"500px.com" in inlink or "instagram.com" in inlink or "changetip.com" in inlink or\
|
|||
|
"linkedin.com" in inlink or "getpocket.com" in inlink or "tip.me" in inlink or\
|
|||
|
"lastfm.fr" in inlink or "plus.google.com" in inlink:
|
|||
|
inlink = "/no-proprietary-service.html"
|
|||
|
return inlink
|
|||
|
|
|||
|
pngs = 0
|
|||
|
jpgs = 0
|
|||
|
noh = 0
|
|||
|
other = 0
|
|||
|
#print(mdsource)
|
|||
|
testfile = "de-la-mediterranee-a-latlantique-en-vtt.md"
|
|||
|
testlist = [testfile]
|
|||
|
#print(testlist)
|
|||
|
#for md_file in testlist:
|
|||
|
for md_file in os.listdir(mdsource):
|
|||
|
title = ""
|
|||
|
date = ""
|
|||
|
permalink = ""
|
|||
|
text = ""
|
|||
|
tag = []
|
|||
|
lang = "fr"
|
|||
|
image = None
|
|||
|
with open(mdsource + "/" + md_file) as f:
|
|||
|
md = html.unescape(f.read())
|
|||
|
md = unicodedata.normalize('NFC', md)
|
|||
|
md = md.replace("</div>","\n")
|
|||
|
md = md.replace("<figcaption>"," ")
|
|||
|
md = md.replace("</figcaption>","\n")
|
|||
|
md = md.replace("<figure class=\"aligncenter\" size-large is-resized>","")
|
|||
|
md = md.replace("<figure class=\"aligncenter is-resized\">","")
|
|||
|
md = md.replace("<figure class=\"aligncenter\">","")
|
|||
|
md = md.replace("<figure class=\"wp-block-image\">","")
|
|||
|
md = md.replace("</figure>","\n")
|
|||
|
md = md.replace("<div class=\"wp-block-image\">","")
|
|||
|
if "<figure class" in md:
|
|||
|
print("figure class in %s"%md_file)
|
|||
|
f.close()
|
|||
|
header = md.split("---")[1].split("\n")
|
|||
|
intag = False
|
|||
|
for line in header:
|
|||
|
if intag:
|
|||
|
if line.strip().startswith("-"):
|
|||
|
tag.append(line.strip(" -"))
|
|||
|
else:
|
|||
|
intag = False
|
|||
|
if line.startswith("title:"):
|
|||
|
title = html.unescape(line.removeprefix("title:").strip("\" '")).replace("\_"," ")
|
|||
|
elif line.startswith("date"):
|
|||
|
date = line.removeprefix("date: ").strip("\' ").split("T")[0]
|
|||
|
elif line.startswith("permalink:"):
|
|||
|
permalink = line.removeprefix("permalink: /").strip(" ")
|
|||
|
elif line.startswith("tag"):
|
|||
|
intag = True
|
|||
|
elif line.startswith("thumbnail:"):
|
|||
|
image = line.removeprefix("thumbnail: ../uploads/").strip()
|
|||
|
image_folder = filesource + "/wp-content/uploads/"
|
|||
|
image = image.replace("-150x150","")
|
|||
|
p = Path(image_folder+image)
|
|||
|
if p.exists():
|
|||
|
dest = Path(filedest + "/files/old/"+image)
|
|||
|
print("copying %s to %s"%(p,dest))
|
|||
|
if not dest.parent.exists():
|
|||
|
os.makedirs(dest.parent)
|
|||
|
shutil.move(p,dest)
|
|||
|
|
|||
|
if "EN" in tag or "en" in tag:
|
|||
|
lang = "en"
|
|||
|
|
|||
|
path = "%s/old/%s-%s.gmi"%(lang,date,permalink)
|
|||
|
text = md2gemini(md,links="copy",img_tag="",plain=True,strip_html=True,\
|
|||
|
frontmatter=True,link_func=convertlink).replace("\r","")
|
|||
|
#We will now try to identify doublelinks (when an image was linking to itself)
|
|||
|
# and remove those
|
|||
|
lines = text.split("\n")
|
|||
|
newtext = ""
|
|||
|
previousimg = None
|
|||
|
previous = None
|
|||
|
def get_imagename(link):
|
|||
|
imagename = None
|
|||
|
if link.startswith("=> "):
|
|||
|
link = link.split(" ")[1]
|
|||
|
if link.endswith(".jpg") or link.endswith(".png") or link.endswith(".gif"):
|
|||
|
imagename = link[:-4]
|
|||
|
elif link.endswith(".jpeg"):
|
|||
|
imagename = link[:-5]
|
|||
|
if imagename:
|
|||
|
imagename = urllib.parse.unquote(imagename)
|
|||
|
return imagename
|
|||
|
for l in lines:
|
|||
|
if l.startswith("=> ../files/old"):
|
|||
|
imagename = get_imagename(l)
|
|||
|
if previousimg and imagename and (previousimg in imagename or imagename in previousimg):
|
|||
|
#print("skip link %s" %l)
|
|||
|
pass
|
|||
|
else:
|
|||
|
previousimg = imagename
|
|||
|
if imagename:
|
|||
|
extension = l.split()[1].split(".")[-1]
|
|||
|
pathimg = Path(filedest+imagename[2:]+"."+extension)
|
|||
|
if not pathimg.exists():
|
|||
|
print("not imported in %s yet : %s" %(path,pathimg))
|
|||
|
newtext += l + "\n"
|
|||
|
## detecting other double links
|
|||
|
elif l.startswith("=> ") and not "no-proprietary-service" in l:
|
|||
|
if previous and l.split()[1] == previous.split()[1]:
|
|||
|
#print("%s : skip link %s" %(date,l))
|
|||
|
pass
|
|||
|
else:
|
|||
|
previous = l
|
|||
|
newtext += l + "\n"
|
|||
|
else:
|
|||
|
newtext += l + "\n"
|
|||
|
|
|||
|
final = "# %s\n" %title
|
|||
|
if image:
|
|||
|
final += "=> files/old/%s\n"%image
|
|||
|
final += "\n"
|
|||
|
final += newtext
|
|||
|
#print(final)
|
|||
|
with open(path,mode="w") as f:
|
|||
|
f.write(final)
|
|||
|
f.close()
|