# Names exported when this module is star-imported.
__all__ = ['findHTMLMeta', 'MetaNotFound']
from html.parser import HTMLParser
import html.entities
import re
import sys
from openid.yadis.constants import YADIS_HEADER_NAME
# How much of the document is read, and then searched, in one step.
CHUNK_SIZE = 16 * 1024  # 16 KB
class ParseDone(Exception):
    """Signal that parsing has finished, carrying the URI that was found.

    When the parse ends without locating the URI, the carried value is
    None.
    """
class MetaNotFound(Exception):
    """Raised when the appropriate tag was not found in the page.

    The exception holds the content of the page.
    """
# Flags shared by the entity pattern: case-insensitive matching (so
# ``&#X41;`` is accepted), Unicode character classes, and verbose layout.
re_flags = re.IGNORECASE | re.UNICODE | re.VERBOSE

# Matches one HTML character reference.  Exactly one of the named groups
# participates in a match, so ``Match.lastgroup`` identifies the kind:
#   hex  -- hexadecimal numeric reference, e.g. ``&#x41;``
#   dec  -- decimal numeric reference, e.g. ``&#65;``
#   word -- named entity, e.g. ``&amp;``
# ``#`` is escaped because it would otherwise start a comment in verbose
# mode.
ent_pat = r'''
&
(?: \#x (?P<hex> [a-f0-9]+ )
| \# (?P<dec> \d+ )
| (?P<word> \w+ )
)
;'''
ent_re = re.compile(ent_pat, re_flags)
def substituteMO(mo):
    """Return the replacement text for one matched character reference.

    ``mo`` is a match object whose ``lastgroup`` is ``'hex'``, ``'dec'``
    or ``'word'``, naming the group that captured the reference body.

    Returns the single character the reference stands for.  The matched
    text is returned unchanged when it cannot be resolved: an unknown
    entity name, or a numeric reference that is not a valid code point.
    """
    kind = mo.lastgroup
    if kind == 'hex':
        digits, base = mo.group('hex'), 16
    elif kind == 'dec':
        digits, base = mo.group('dec'), 10
    else:
        assert kind == 'word'
        codepoint = html.entities.name2codepoint.get(mo.group('word'))
        # Unknown entity names are left exactly as they were written.
        return mo.group() if codepoint is None else chr(codepoint)

    try:
        return chr(int(digits, base))
    except (ValueError, OverflowError):
        # The number is outside the range chr() accepts (or too long for
        # int() to convert); the input is untrusted page text, so leave the
        # reference untouched instead of aborting the whole substitution.
        return mo.group()
def substituteEntities(s):
    """Return ``s`` with each match of ``ent_re`` replaced.

    Every match is passed to ``substituteMO``, whose return value is
    used as the replacement text.
    """
    return re.sub(ent_re, substituteMO, s)
class YadisHTMLParser(HTMLParser):
"""Parser that finds a meta http-equiv tag in the head of a html
document.
When feeding in data, if the tag is matched or it will never be
found, the parser will raise ParseDone with the uri as the first
attribute.
Parsing state diagram
=====================
Any unlisted input does not affect the state::
1, 2, 5 8
+--------------------------+ +-+
| | | |
4 | 3 1, 2, 5, 7 v | v
TOP -> HTML -> HEAD ----------> TERMINATED
| | ^ | ^ ^
| | 3 | | | |
| +------------+ +-> FOUND ------+ |
| 6 8 |
| 1, 2 |
+------------------------------------+
1. any of