# encoding: utf-8
"""Simple script checking the plain-text spelling of a web page.

The page is downloaded, its visible text is extracted and fed to
``hunspell`` in pipe mode; every distinct misspelled word is printed on its
own line, sorted alphabetically.
"""

import argparse
import subprocess
import sys
from typing import List, Optional, Tuple

# Seconds to wait for the web server before giving up; without a timeout
# ``requests`` may block forever on an unresponsive host.
REQUEST_TIMEOUT = 30


def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command line (``argv`` defaults to ``sys.argv[1:]``)."""
    parser = argparse.ArgumentParser(
        description="Check spell from a webpage, using language from webpage if possible."
    )
    parser.add_argument("URL", type=str, help="URL to check spell from.")
    parser.add_argument(
        "-l",
        "--language",
        type=str,
        help="Language to use if not specified by the webpage itself.",
        default="en_GB",
    )
    return parser.parse_args(argv)


def normalize_language(header_value: Optional[str], default: str) -> str:
    """Turn a ``Content-Language`` header value into a hunspell dictionary name.

    The header carries hyphenated language tags, possibly several separated
    by commas (``"en-GB, fr"``), whereas hunspell dictionaries are named with
    an underscore (``en_GB``).  Only the first tag is used.

    Args:
        header_value: Raw header value, or ``None`` when the header is absent.
        default: Dictionary name returned when no usable tag is found.

    Returns:
        The dictionary name to hand to ``hunspell -d``.
    """
    if not header_value:
        return default
    first_tag = header_value.split(",")[0].strip()
    if not first_tag:
        return default
    parts = first_tag.replace("-", "_").split("_")
    # Language tags are case-insensitive but dictionary file names are not:
    # use the conventional ``ll_CC`` casing.
    parts[0] = parts[0].lower()
    if len(parts) > 1 and len(parts[1]) == 2:
        parts[1] = parts[1].upper()
    return "_".join(parts)


def fetch_page(url: str) -> Tuple[bytes, Optional[str]]:
    """Download ``url`` and return ``(body, Content-Language header or None)``.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status (an error page must not be spell-checked).
    """
    # Imported here so the pure helpers of this module stay importable even
    # when the third-party packages are not installed.
    import requests

    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.content, response.headers.get("Content-Language")


def extract_text(html: bytes) -> str:
    """Return the human-readable text of an HTML document."""
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, features="html.parser")
    # Scripts and style sheets are code, not prose.
    for tag in soup.find_all(["script", "style"]):
        tag.decompose()
    # The separator keeps words from adjacent elements from being glued
    # together (e.g. "<li>foo</li><li>bar</li>").
    return soup.get_text(separator=" ")


def to_pipe_input(text: str) -> str:
    """Prepare ``text`` for hunspell's ispell-compatible pipe mode.

    In pipe mode a line starting with ``*``, ``@``, ``#``, ``+``, ``-``,
    ``!`` or ``~`` is a *command* (some of them modify the personal
    dictionary).  Prefixing each line with ``^`` forces it to be treated as
    text to check.  Blank lines are dropped.
    """
    return "".join(
        "^" + line + "\n" for line in text.splitlines() if line.strip()
    )


def run_hunspell(text: str, language: str) -> str:
    """Spell-check ``text`` with dictionary ``language``; return raw output.

    Raises:
        FileNotFoundError: if the ``hunspell`` executable is not installed.
        subprocess.CalledProcessError: if hunspell exits with an error
            (typically an unknown dictionary).
    """
    completed = subprocess.run(
        ["hunspell", "-d", language, "-a"],
        universal_newlines=True,
        input=to_pipe_input(text),
        stdout=subprocess.PIPE,
        check=True,
    )
    return completed.stdout


def extract_misspelled(hunspell_output: str) -> List[str]:
    """Return the sorted, distinct misspelled words from pipe-mode output.

    Two kinds of result lines denote a misspelling:

    * ``& word count offset: suggestion, ...`` -- suggestions available
    * ``# word offset`` -- no suggestion found
    """
    misspelled = set()
    for line in hunspell_output.splitlines():
        if not line.startswith(("&", "#")):
            continue
        fields = line.split()
        if len(fields) > 1:
            misspelled.add(fields[1])
    return sorted(misspelled)


def main(argv: Optional[List[str]] = None) -> int:
    """Run the spell check and print the problematic words."""
    args = parse_args(argv)

    # Getting webpage content
    html, header_language = fetch_page(args.URL)
    language = normalize_language(header_language, args.language)

    # Checking spell
    output = run_hunspell(extract_text(html), language)

    # Parse hunspell results to get problematic words
    words = extract_misspelled(output)
    if words:
        print("\n".join(words))
    return 0


if __name__ == "__main__":
    sys.exit(main())