Fix inefficiencies in auto-linking code (#16506)

The auto-linking code basically rewrote the whole string escaping non-ascii
characters in an inefficient way, and building a full character offset map
between the unescaped and escaped texts before sending the contents to
TwitterText's extractor.

Instead of doing that, this commit changes the TwitterText regexps to include
valid IRI characters in addition to valid URI characters.
This commit is contained in:
Claire 2021-07-15 15:56:58 +02:00 committed by GitHub
parent 3dcf3f2a3a
commit 211d5c3c30
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 5 additions and 30 deletions

View File

@ -214,39 +214,10 @@ class Formatter
result.flatten.join result.flatten.join
end end
UNICODE_ESCAPE_BLACKLIST_RE = /\p{Z}|\p{P}/
def utf8_friendly_extractor(text, options = {}) def utf8_friendly_extractor(text, options = {})
old_to_new_index = [0]
escaped = text.chars.map do |c|
output = begin
if c.ord.to_s(16).length > 2 && !UNICODE_ESCAPE_BLACKLIST_RE.match?(c)
CGI.escape(c)
else
c
end
end
old_to_new_index << old_to_new_index.last + output.length
output
end.join
# Note: I couldn't obtain list_slug with @user/list-name format # Note: I couldn't obtain list_slug with @user/list-name format
# for mention so this requires additional check # for mention so this requires additional check
special = Extractor.extract_urls_with_indices(escaped, options).map do |extract| special = Extractor.extract_urls_with_indices(text, options)
new_indices = [
old_to_new_index.find_index(extract[:indices].first),
old_to_new_index.find_index(extract[:indices].last),
]
next extract.merge(
indices: new_indices,
url: text[new_indices.first..new_indices.last - 1]
)
end
standard = Extractor.extract_entities_with_indices(text, options) standard = Extractor.extract_entities_with_indices(text, options)
extra = Extractor.extract_extra_uris_with_indices(text, options) extra = Extractor.extract_extra_uris_with_indices(text, options)

View File

@ -24,6 +24,10 @@ module Twitter::TwitterText
) )
\) \)
/iox /iox
REGEXEN[:valid_iri_ucschar] = /[\u{A0}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}\u{10000}-\u{1FFFD}\u{20000}-\u{2FFFD}\u{30000}-\u{3FFFD}\u{40000}-\u{4FFFD}\u{50000}-\u{5FFFD}\u{60000}-\u{6FFFD}\u{70000}-\u{7FFFD}\u{80000}-\u{8FFFD}\u{90000}-\u{9FFFD}\u{A0000}-\u{AFFFD}\u{B0000}-\u{BFFFD}\u{C0000}-\u{CFFFD}\u{D0000}-\u{DFFFD}\u{E1000}-\u{EFFFD}]/iou
REGEXEN[:valid_iri_iprivate] = /[\u{E000}-\u{F8FF}\u{F0000}-\u{FFFFD}\u{100000}-\u{10FFFD}]/iou
REGEXEN[:valid_url_query_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/iou
REGEXEN[:valid_url_query_ending_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9_&=#\/\-]/iou
REGEXEN[:valid_url_path] = /(?: REGEXEN[:valid_url_path] = /(?:
(?: (?:
#{REGEXEN[:valid_general_url_path_chars]}* #{REGEXEN[:valid_general_url_path_chars]}*