Mise en place de manière plus propre

2021-01-27 08:32:53 +01:00 · 2021-01-27 08:32:53 +01:00 · cdfab7c699
commit cdfab7c699
parent 2a9287db68
9 changed files with 174 additions and 17 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+.venv
+whitelist*
--- a/README.md
+++ b/README.md
@ -0,0 +1,52 @@
+# Surveillance d'un wiki de type MediaWiki pour l'April
+
+D'un côté, on récupère les modifications faites sur la dernière heure écoulée
+
+Soit, via un cron :
+
+    0 * * * * /path/to/venv/python feed_read.py -d 60 "https://wiki.april.org/api.php?action=feedrecentchanges&feedformat=atom" >> april_feed
+
+Soit via une commande en ligne :
+
+    while true; do python feed_read.py -d 60 "https://wiki.april.org/api.php?action=feedrecentchanges&feedformat=atom" >> april_feed; sleep 60m; done
+
+
+Cette commande récupère les dernières modifications faites et les stocke dans un fichier "april_feed".
+C'est ce même fichier que l'on va exploiter pour vérifier et prévenir, si besoin, qu'il y a des choses à faire.
+
+Via la commande :
+
+    tail -f april_feed |python april_wiki.py
+
+On absorbe les dernières modifications et on vérifie :
+
+* l'auteur de la modification
+* la page modifiée finale
+
+Si l'auteur est inconnu, on signale son nom et la page modifiée.
+Qu'il soit connu ou non, si la page contient des fautes d'orthographe, on signale la page et tous les mots trouvés.
+
+Deux fichiers spécifiques sont utilisés (créés automatiquement et vides par défaut) :
+
+* whitelist_authors
+* whitelist_words
+
+Le premier contient le nom des auteurs connus dont on n'a pas besoin de se méfier. Ce sont des contributeurs connus du
+wiki et leur nom est donc stocké dans ce fichier. Ce dernier étant rechargé à chaque nouvelle modification à traiter, il
+peut être amendé à tout moment.
+
+Le second fichier contient les "mots" que le dictionnaire ne connait pas et qui ne sont pas considérés comme "mauvais". Il
+s'agit de pseudonymes, d'acronymes, etc.
+
+
+## Installation
+
+Le mieux étant toujours de créer un environnement python dédié, voici les commandes à utiliser pour l'installation :
+
+    python3 -m venv .venv
+    source .venv/bin/activate
+    pip install -r requirements.txt
+
+À partir de là, vous pouvez donc utiliser les scripts soit en ayant activé l'environnement (activate),
+soit en utilisant l'exécutable python situé dans le répertoire de l'environnement.
+
--- a/1
+++ b/1
@ -0,0 +1 @@
+2021-01-27T07:08:52Z	Mindiell	Utilisateur:Mindiell	https://wiki.april.org/index.php?title=Utilisateur:Mindiell&diff=91609&oldid=91524
--- a/april_wiki.py
+++ b/april_wiki.py
@ -0,0 +1,53 @@
+# encoding: utf-8
+"""
+Ce script s'appuie sur d'autres scripts pour vérifier les modifications du wiki
+de l'April.
+"""
+
+import subprocess
+import sys
+
+# On reçoit continuellement des modifications du wiki
+for line in sys.stdin:
+    # On recharge les auteurs connus
+    try:
+        with open("whitelist_authors") as authors_handle:
+            authors = authors_handle.read().splitlines()
+    except:
+        open("whitelist_authors", "a")
+        authors = []
+
+    # On recharge les mots connus qu'on ne souhaite pas corriger
+    try:
+        with open("whitelist_words") as words_handle:
+            words = words_handle.read().splitlines()
+    except:
+        open("whitelist_words", "a")
+        words = []
+
+    # On extrait les informations remontées
+    _, author, _, url = line.split("\t")
+    # On nettoie l'url
+    url = "&".join(url.split("&")[:1])
+    # On vérifie l'orthographe de la page
+    output = subprocess.run(
+        ["python", "checkspell.py", url],
+        universal_newlines=True,
+        stdout=subprocess.PIPE,
+        check=True,
+    )
+    # Pour chaque mot remonté, on vérifie qu'il n'est pas dans la liste blanche
+    smells = []
+    for word in output.stdout.split("\n"):
+        if word != "" and word not in words:
+            smells.append(word)
+    # Si l'auteur est inconnu, on pointe le lien vers la page modifiée / créée
+    if author not in authors:
+        print(f"  {author} : {url}")
+    # S'il y a quelque chose à corriger
+    if len(smells) > 0:
+        print(f"  {url} : {smells}")
+    # On affiche une ligne de séparation pour faire plus propre
+    if author not in authors or len(smells) > 0:
+        print("-"*12)
+
--- a/checkspell.py
+++ b/checkspell.py
@ -0,0 +1,38 @@
+#encoding: utf-8
+"""
+Simple script checking the plain-text spelling of a web page.
+"""
+
+import argparse
+import subprocess
+
+from bs4 import BeautifulSoup
+import requests
+
+parser = argparse.ArgumentParser(description="Check spell from a webpage, using language from webpage if possible.")
+parser.add_argument("URL", type=str, help="URL to check spell from.")
+parser.add_argument("-l", "--language", type=str, help="Language to use if not specified by the webpage itself.", default="en_GB")
+args = parser.parse_args()
+
+# Getting webpage content
+result = requests.get(args.URL)
+language = result.headers.get("Content-language", args.language)
+soup = BeautifulSoup(result.content.replace(b"<", b" <"), features="html.parser")
+# Removing script tags
+for tag in soup.findAll("script"):
+    tag.extract()
+# Checking spell
+output = subprocess.run(
+    ["hunspell", "-d", language, "-a"],
+    universal_newlines=True,
+    input=soup.text,
+    stdout=subprocess.PIPE,
+    check=True,
+)
+# Parse hunspell results to get problematic words
+words = []
+for line in output.stdout.splitlines():
+    if line.strip() != "" and line[0]=="&":
+        words.append(line.split()[1])
+print("\n".join(list(set(words))))
+
--- a/feed_read.py
+++ b/feed_read.py
@ -1,12 +1,30 @@
 # encoding: utf-8
+"""
+Simple script to display RSS/Atom feeds as tab-separated values.
+
+Values displayed are :
+- last updated
+- author(s) (comma separated)
+- title
+- link
+"""
+
+import argparse
+import time

 import feedparser

-for entry in feedparser.parse("https://wiki.april.org/api.php?hidebots=1&urlversion=1&days=7&limit=50&action=feedrecentchanges&feedformat=atom")["entries"]:
-    print("\t".join((
-        entry["updated"],
-        ",".join([a["name"] for a in entry["authors"]]),
-        entry["title"],
-        entry["link"],
-    )))
+parser = argparse.ArgumentParser(description="Display RSS/Atom feeds in tabulation separated values.")
+parser.add_argument("URL", type=str, help="URL to get feed from.")
+parser.add_argument("-d", "--delay", type=int, help="Delay from when get last news (in minutes).", default=10)
+args = parser.parse_args()

+for entry in feedparser.parse(args.URL)["entries"]:
+    delay = (time.mktime(time.gmtime()) - time.mktime(entry["updated_parsed"])) / 60
+    if delay <= args.delay:
+        print("\t".join((
+            entry["updated"],
+            ",".join([a["name"] for a in entry["authors"]]),
+            entry["title"],
+            entry["link"],
+        )))
--- a/1
+++ b/1
@ -1 +0,0 @@
-unspell -d fr_FR -a result_page.txt |grep "&"
--- a/mediawiki_spell.py
+++ b/mediawiki_spell.py
@ -1,9 +0,0 @@
-#encoding: utf-8
-
-from bs4 import BeautifulSoup
-import requests
-
-result = requests.get("https://wiki.april.org/index.php?title=Reunion_du_21_janvier_2021")
-soup = BeautifulSoup(result.content.replace(b"<", b" <"), features="html.parser")
-print(soup.text)
-
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,3 @@
+bs4
+feedparser
+requests
				`@ -0,0 +1 @@`
				`2021-01-27T07:08:52Z Mindiell Utilisateur:Mindiell https://wiki.april.org/index.php?title=Utilisateur:Mindiell&diff=91609&oldid=91524`
				`@ -1 +0,0 @@`
				`unspell -d fr_FR -a result_page.txt \|grep "&"`