wiki_monitor/checkspell.py

39 lines
1.2 KiB
Python

#encoding: utf-8
"""
Simple script checking the spelling of the plain text of a web page.
"""
import argparse
import subprocess
from bs4 import BeautifulSoup
import requests
# Hunspell dictionary used when neither the page nor the user names one.
DEFAULT_LANGUAGE = "en_GB"
# Seconds to wait for the web server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30


def normalize_language(header_value, default=DEFAULT_LANGUAGE):
    """Turn a ``Content-Language`` header value into a hunspell dictionary name.

    The header holds comma-separated language tags written with hyphens
    (``en-GB``, ``de-DE, en``) whereas dictionaries are named with an
    underscore (``en_GB``). Only the first tag is used.

    Args:
        header_value: Raw header value, or ``None`` when the header is absent.
        default: Dictionary name returned when no usable tag is found.

    Returns:
        The dictionary name to pass to ``hunspell -d``.
    """
    if not header_value:
        return default
    first_tag = header_value.split(",")[0].strip()
    return first_tag.replace("-", "_") or default


def extract_visible_text(html):
    """Return the text of an HTML document without script and style content.

    Args:
        html: Raw page body as ``bytes``.

    Returns:
        The concatenated text nodes of the page as ``str``.
    """
    # A space is inserted before every tag so that words from adjacent
    # elements are not glued together once the markup is stripped.
    soup = BeautifulSoup(html.replace(b"<", b" <"), features="html.parser")
    # JavaScript and CSS are not prose and would only produce false positives.
    for tag in soup.find_all(["script", "style"]):
        tag.extract()
    return soup.text


def protect_lines(text):
    """Prepare text for ``hunspell -a`` by prefixing every line with ``^``.

    In pipe mode hunspell interprets lines starting with characters such as
    ``*``, ``#``, ``@``, ``+`` or ``!`` as commands (e.g. "add this word to the
    personal dictionary"). A leading ``^`` makes it spell-check the rest of the
    line instead, so page content can never be mistaken for a command.
    Blank lines carry nothing to check and are dropped.

    Args:
        text: Plain text to be spell-checked.

    Returns:
        The text with one ``^``-prefixed, newline-terminated line per
        non-blank input line.
    """
    return "".join(
        "^" + line + "\n" for line in text.splitlines() if line.strip()
    )


def misspelled_words(hunspell_output):
    """Collect the misspelled words reported by ``hunspell -a``.

    Result lines have the form ``& word count offset: suggestions`` when
    suggestions exist and ``# word offset`` when there are none; both denote a
    misspelling. Every other line (version banner, ``*``, ``+``, blanks) is
    ignored.

    Args:
        hunspell_output: Complete standard output of the hunspell process.

    Returns:
        A sorted list of the distinct misspelled words.
    """
    found = set()
    for line in hunspell_output.splitlines():
        if not line.startswith(("&", "#")):
            continue
        fields = line.split()
        # Guard against a malformed marker line that carries no word.
        if len(fields) > 1:
            found.add(fields[1])
    return sorted(found)


def main():
    """Fetch the URL given on the command line and print its misspelled words.

    Raises:
        requests.RequestException: If the page cannot be fetched or the server
            answers with an error status.
        subprocess.CalledProcessError: If hunspell exits with a non-zero status.
    """
    parser = argparse.ArgumentParser(
        description="Check spell from a webpage, using language from webpage if possible."
    )
    parser.add_argument("URL", type=str, help="URL to check spell from.")
    parser.add_argument(
        "-l",
        "--language",
        type=str,
        help="Language to use if not specified by the webpage itself.",
        default=DEFAULT_LANGUAGE,
    )
    args = parser.parse_args()

    # Getting webpage content
    result = requests.get(args.URL, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of spell-checking an error page,
    # mirroring check=True on the hunspell call below.
    result.raise_for_status()
    language = normalize_language(
        result.headers.get("Content-language"), args.language
    )

    # Checking spell
    output = subprocess.run(
        ["hunspell", "-d", language, "-a"],
        universal_newlines=True,
        input=protect_lines(extract_visible_text(result.content)),
        stdout=subprocess.PIPE,
        check=True,
    )

    # One misspelled word per line, in a stable (sorted) order.
    print("\n".join(misspelled_words(output.stdout)))


if __name__ == "__main__":
    main()