You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
38 lines
1.2 KiB
38 lines
1.2 KiB
#encoding: utf-8 |
|
""" |
|
Simple script checking the spelling of the plain text of a web page.
|
""" |
|
|
|
import argparse |
|
import subprocess |
|
|
|
from bs4 import BeautifulSoup |
|
import requests |
|
|
|
# Seconds to wait for the web server before giving up; without a timeout
# requests.get() may block forever on an unresponsive host.
REQUEST_TIMEOUT = 30

# First field of the hunspell "-a" result lines that report a misspelling:
# "&" = misspelled with suggestions, "#" = misspelled without any suggestion.
MISSPELLED_MARKERS = ("&", "#")


def build_parser():
    """Return the command-line parser (positional URL, optional -l/--language)."""
    parser = argparse.ArgumentParser(description="Check spell from a webpage, using language from webpage if possible.")
    parser.add_argument("URL", type=str, help="URL to check spell from.")
    parser.add_argument("-l", "--language", type=str, help="Language to use if not specified by the webpage itself.", default="en_GB")
    return parser


def dictionary_name(content_language, fallback):
    """Turn a Content-Language header value into a hunspell dictionary name.

    The header may list several comma-separated language tags and writes them
    with hyphens ("en-US, fr"); only the first tag is kept and hyphens are
    replaced by underscores ("en_US").

    Returns *fallback* when the header is missing or blank.
    """
    if not content_language:
        return fallback
    first_tag = content_language.split(",")[0].strip()
    if not first_tag:
        return fallback
    return first_tag.replace("-", "_")


def protect_lines(text):
    """Prefix every line of *text* with "^" for hunspell's pipe mode.

    In "-a" mode a line starting with characters such as "*", "@", "#", "+",
    "-", "!", "~" or "%" is read as a command (some of them modify or save the
    personal dictionary).  A leading "^" tells the checker that the rest of
    the line is plain text to be checked.
    """
    return "".join("^" + line + "\n" for line in text.splitlines())


def misspelled_words(hunspell_output):
    """Return the sorted, de-duplicated misspelled words from "hunspell -a" output.

    Only lines whose first field is "&" or "#" are considered; the reported
    word is the second whitespace-separated field of such a line.
    """
    found = set()
    for line in hunspell_output.splitlines():
        fields = line.split()
        if len(fields) >= 2 and fields[0] in MISSPELLED_MARKERS:
            found.add(fields[1])
    return sorted(found)


def main(argv=None):
    """Fetch the page, spell-check its text with hunspell and print bad words.

    argv: argument list to parse; None means the process command line.
    Raises subprocess.CalledProcessError if hunspell exits with a non-zero
    status (check=True).
    """
    args = build_parser().parse_args(argv)

    # Getting webpage content
    result = requests.get(args.URL, timeout=REQUEST_TIMEOUT)
    language = dictionary_name(result.headers.get("Content-Language"), args.language)
    # A space is inserted before every "<"; presumably so that words from
    # adjacent tags are not glued together once the markup is stripped.
    soup = BeautifulSoup(result.content.replace(b"<", b" <"), features="html.parser")

    # Removing script and style tags: their content is code, not prose
    for tag in soup.find_all(["script", "style"]):
        tag.extract()

    # Checking spell
    output = subprocess.run(
        ["hunspell", "-d", language, "-a"],
        universal_newlines=True,
        input=protect_lines(soup.text),
        stdout=subprocess.PIPE,
        check=True,
    )

    # Parse hunspell results to get problematic words
    print("\n".join(misspelled_words(output.stdout)))


if __name__ == "__main__":
    main()
|
|
|
|