# wiki_monitor/checkspell.py

#encoding: utf-8
"""
Simple script checking the spelling of the plain text of a web page.
"""

import argparse
import subprocess

from bs4 import BeautifulSoup
import requests

# Seconds to wait for the web server before giving up on the request.
REQUEST_TIMEOUT = 30


def build_parser():
    """Return the command-line argument parser of the script."""
    parser = argparse.ArgumentParser(
        description="Check spell from a webpage, using language from webpage if possible."
    )
    parser.add_argument("URL", type=str, help="URL to check spell from.")
    parser.add_argument(
        "-l",
        "--language",
        type=str,
        help="Language to use if not specified by the webpage itself.",
        default="en_GB",
    )
    return parser


def pick_dictionary(content_language, fallback):
    """Turn a ``Content-Language`` header value into a hunspell dictionary name.

    The header carries hyphenated language tags, possibly several separated
    by commas (``"de-DE, en-CA"``), whereas hunspell dictionaries are named
    with an underscore (``de_DE``). Only the first listed tag is used.

    Args:
        content_language: Raw header value, or ``None`` when the header is
            absent.
        fallback: Dictionary name to use when the header is missing or empty.

    Returns:
        The dictionary name to pass to ``hunspell -d``.
    """
    if not content_language:
        return fallback
    first_tag = content_language.split(",")[0].strip()
    if not first_tag:
        return fallback
    return first_tag.replace("-", "_")


def fetch_page_text(url):
    """Download *url* and return ``(plain_text, content_language_header)``.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status (an error page is not worth spell checking).
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # A space is inserted before every tag so that the text of adjacent
    # elements is not glued together into a single bogus word.
    soup = BeautifulSoup(response.content.replace(b"<", b" <"), features="html.parser")
    # Removing script tags
    for tag in soup.find_all("script"):
        tag.extract()
    return soup.text, response.headers.get("Content-language")


def escape_pipe_input(text):
    """Prepare *text* for the ``hunspell -a`` pipe interface.

    In pipe mode, lines starting with characters such as ``*``, ``@``, ``#``
    or ``!`` are interpreted as commands (e.g. "add to personal dictionary")
    instead of being checked. Prefixing a line with ``^`` forces hunspell to
    spell-check the rest of that line. Blank lines are dropped since they
    carry nothing to check.

    Returns:
        The escaped text, one ``^``-prefixed line per non-blank input line,
        each terminated by a newline (empty string for empty input).
    """
    return "".join(
        "^" + line + "\n" for line in text.splitlines() if line.strip()
    )


def parse_hunspell_output(stdout):
    """Extract the misspelled words from ``hunspell -a`` output.

    Two kinds of result lines report a misspelling:

    * ``& word count offset: suggestion, ...`` -- suggestions available;
    * ``# word offset`` -- no suggestion available.

    Returns:
        The distinct misspelled words, sorted so the output is reproducible.
    """
    words = set()
    for line in stdout.splitlines():
        if line.startswith(("&", "#")):
            fields = line.split()
            if len(fields) > 1:
                words.add(fields[1])
    return sorted(words)


def main(argv=None):
    """Run the spell check and print one misspelled word per line.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.

    Raises:
        requests.RequestException: If the page cannot be fetched.
        subprocess.CalledProcessError: If hunspell exits with an error
            (for instance when the dictionary is not installed).
    """
    args = build_parser().parse_args(argv)

    # Getting webpage content
    text, header_language = fetch_page_text(args.URL)
    language = pick_dictionary(header_language, args.language)

    # Checking spell
    output = subprocess.run(
        ["hunspell", "-d", language, "-a"],
        universal_newlines=True,
        input=escape_pipe_input(text),
        stdout=subprocess.PIPE,
        check=True,
    )

    # Parse hunspell results to get problematic words
    print("\n".join(parse_hunspell_output(output.stdout)))


if __name__ == "__main__":
    main()