You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
39 lines
1.2 KiB
Python
39 lines
1.2 KiB
Python
#encoding: utf-8
|
|
"""
|
|
Simple script checking the spelling of the plain text of a web page.
|
|
"""
|
|
|
|
import argparse
|
|
import subprocess
|
|
|
|
from bs4 import BeautifulSoup
|
|
import requests
|
|
|
|
# Seconds to wait for the web server before giving up; without a timeout
# requests.get() can block forever on an unresponsive host.
REQUEST_TIMEOUT_SECONDS = 30

# First character of the `hunspell -a` result lines that report a misspelling:
# "&" = unknown word with suggestions, "#" = unknown word without suggestions.
MISSPELLING_MARKERS = ("&", "#")


def build_parser():
    """Return the command-line parser (positional URL, optional -l/--language)."""
    parser = argparse.ArgumentParser(
        description="Check spell from a webpage, using language from webpage if possible."
    )
    parser.add_argument("URL", type=str, help="URL to check spell from.")
    parser.add_argument(
        "-l",
        "--language",
        type=str,
        help="Language to use if not specified by the webpage itself.",
        default="en_GB",
    )
    return parser


def normalize_language(header_value, fallback):
    """Turn a Content-Language header value into a hunspell dictionary name.

    The header may list several comma-separated tags ("de-DE, en-CA") and
    separates language and region with a hyphen, whereas the dictionary name
    given to ``hunspell -d`` uses an underscore ("de_DE"). Only the first
    tag is used.

    Args:
        header_value: Raw header value, or None when the header is absent.
        fallback: Dictionary name returned when the header is missing or empty.

    Returns:
        The dictionary name to pass to ``hunspell -d``.
    """
    if not header_value:
        return fallback
    first_tag = header_value.split(",")[0].strip()
    if not first_tag:
        return fallback
    return first_tag.replace("-", "_")


def fetch_page(url):
    """Download *url* and return ``(plain_text, content_language_header)``.

    The second element is None when the server sent no Content-Language
    header.

    Raises:
        requests.RequestException: on connection problems, on timeout, or
            when the server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Do not spell-check the body of an error page as if it were the document.
    response.raise_for_status()
    # Put a space in front of every tag so that words separated only by markup
    # (e.g. "</p><p>") are not glued together once the tags are stripped.
    soup = BeautifulSoup(response.content.replace(b"<", b" <"), features="html.parser")
    # Script and style contents are code, not prose: keep them out of the text.
    for tag in soup.find_all(["script", "style"]):
        tag.extract()
    return soup.text, response.headers.get("Content-language")


def protect_pipe_input(text):
    """Prefix every line of *text* with "^" for the hunspell pipe interface.

    In ``-a`` mode a line starting with a character such as "*", "#", "!" or
    "+" is interpreted as a command (for example "*word" adds a word to the
    personal dictionary). A leading "^" tells the checker to treat the rest of
    the line as plain text, so page content can never trigger a command.
    """
    return "\n".join("^" + line for line in text.splitlines())


def run_hunspell(text, language):
    """Spell-check *text* with ``hunspell -a`` and return its raw output.

    Raises:
        subprocess.CalledProcessError: when hunspell exits with a non-zero
            status.
    """
    completed = subprocess.run(
        ["hunspell", "-d", language, "-a"],
        universal_newlines=True,
        input=protect_pipe_input(text),
        stdout=subprocess.PIPE,
        check=True,
    )
    return completed.stdout


def misspelled_words(hunspell_output):
    """Extract the misspelled words from ``hunspell -a`` output.

    Result lines look like ``& word 3 0: sug1, sug2`` (with suggestions) or
    ``# word 0`` (without suggestions); in both cases the second field is the
    offending word. Every other line (banner, "*", "+ root", blank) is ignored.

    Returns:
        The distinct misspelled words, sorted so that the output is stable
        from one run to the next.
    """
    found = set()
    for line in hunspell_output.splitlines():
        if line.startswith(MISSPELLING_MARKERS):
            fields = line.split()
            if len(fields) > 1:
                found.add(fields[1])
    return sorted(found)


def main(argv=None):
    """Fetch the page named on the command line and print its misspelled words.

    Args:
        argv: Argument list to parse; None means ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    text, header_language = fetch_page(args.URL)
    language = normalize_language(header_language, args.language)
    print("\n".join(misspelled_words(run_hunspell(text, language))))


if __name__ == "__main__":
    main()
|
|
|