From 61cefbebf717326bd6ec3923e67e3702a24a0b24 Mon Sep 17 00:00:00 2001 From: Claire Date: Mon, 28 Mar 2022 20:51:51 +0200 Subject: [PATCH] Add advanced text formatting back into glitch-soc --- app/helpers/formatting_helper.rb | 2 +- app/lib/advanced_text_formatter.rb | 131 +++++++++++ app/lib/html_aware_formatter.rb | 6 +- lib/sanitize_ext/sanitize_config.rb | 57 +++-- spec/lib/advanced_text_formatter_spec.rb | 274 +++++++++++++++++++++++ spec/lib/sanitize_config_spec.rb | 18 +- 6 files changed, 459 insertions(+), 29 deletions(-) create mode 100644 app/lib/advanced_text_formatter.rb create mode 100644 spec/lib/advanced_text_formatter_spec.rb diff --git a/app/helpers/formatting_helper.rb b/app/helpers/formatting_helper.rb index e11156999..2a622ae0b 100644 --- a/app/helpers/formatting_helper.rb +++ b/app/helpers/formatting_helper.rb @@ -14,7 +14,7 @@ module FormattingHelper end def status_content_format(status) - html_aware_format(status.text, status.local?, preloaded_accounts: [status.account] + (status.respond_to?(:active_mentions) ? status.active_mentions.map(&:account) : [])) + html_aware_format(status.text, status.local?, preloaded_accounts: [status.account] + (status.respond_to?(:active_mentions) ? status.active_mentions.map(&:account) : []), content_type: status.content_type) end def account_bio_format(account) diff --git a/app/lib/advanced_text_formatter.rb b/app/lib/advanced_text_formatter.rb new file mode 100644 index 000000000..5ce87d306 --- /dev/null +++ b/app/lib/advanced_text_formatter.rb @@ -0,0 +1,131 @@ +# frozen_string_literal: true + +class AdvancedTextFormatter < TextFormatter + class HTMLRenderer < Redcarpet::Render::HTML + def initialize(options, &block) + super(options) + @format_link = block + end + + def block_code(code, _language) + <<~HTML.squish +
#{h(code).gsub("\n", '
')}
+ HTML + end + + def autolink(link, link_type) + return link if link_type == :email + @format_link.call(link) + end + end + + # @param [String] text + # @param [Hash] options + # @option options [Boolean] :multiline + # @option options [Boolean] :with_domains + # @option options [Boolean] :with_rel_me + # @option options [Array] :preloaded_accounts + # @option options [String] :content_type + def initialize(text, options = {}) + content_type = options.delete(:content_type) + super(text, options) + + @text = format_markdown(text) if content_type == 'text/markdown' + end + + # Differs from TextFormatter by not messing with newline after parsing + def to_s + return ''.html_safe if text.blank? + + html = rewrite do |entity| + if entity[:url] + link_to_url(entity) + elsif entity[:hashtag] + link_to_hashtag(entity) + elsif entity[:screen_name] + link_to_mention(entity) + end + end + + html.html_safe # rubocop:disable Rails/OutputSafety + end + + # Differs from `TextFormatter` by skipping HTML tags and entities + def entities + @entities ||= begin + gaps = [] + total_offset = 0 + + escaped = text.gsub(/<[^>]*>|&#[0-9]+;/) do |match| + total_offset += match.length - 1 + end_offset = Regexp.last_match.end(0) + gaps << [end_offset - total_offset, total_offset] + ' ' + end + + Extractor.extract_entities_with_indices(escaped, extract_url_without_protocol: false).map do |entity| + start_pos, end_pos = entity[:indices] + offset_idx = gaps.rindex { |gap| gap.first <= start_pos } + offset = offset_idx.nil? ? 0 : gaps[offset_idx].last + entity.merge(indices: [start_pos + offset, end_pos + offset]) + end + end + end + + private + + # Differs from `TextFormatter` in that it keeps HTML; but it sanitizes at the end to remain safe + def rewrite + entities.sort_by! do |entity| + entity[:indices].first + end + + result = ''.dup + + last_index = entities.reduce(0) do |index, entity| + indices = entity[:indices] + result << text[index...indices.first] + result << yield(entity) + indices.last + end + + result << text[last_index..-1] + + Sanitize.fragment(result, Sanitize::Config::MASTODON_OUTGOING) + end + + def format_markdown(html) + html = markdown_formatter.render(html) + html.delete("\r").delete("\n") + end + + def markdown_formatter + extensions = { + autolink: true, + no_intra_emphasis: true, + fenced_code_blocks: true, + disable_indented_code_blocks: true, + strikethrough: true, + lax_spacing: true, + space_after_headers: true, + superscript: true, + underline: true, + highlight: true, + footnotes: false, + } + + renderer = HTMLRenderer.new({ + filter_html: false, + escape_html: false, + no_images: true, + no_styles: true, + safe_links_only: true, + hard_wrap: true, + link_attributes: { target: '_blank', rel: 'nofollow noopener' }, + }) do |url| + link_to_url({ url: url }) + end + + Redcarpet::Markdown.new(renderer, extensions) + end +end diff --git a/app/lib/html_aware_formatter.rb b/app/lib/html_aware_formatter.rb index 64edba09b..7a1cd0340 100644 --- a/app/lib/html_aware_formatter.rb +++ b/app/lib/html_aware_formatter.rb @@ -33,6 +33,10 @@ class HtmlAwareFormatter end def linkify - TextFormatter.new(text, options).to_s + if %w(text/markdown text/html).include?(@options[:content_type]) + AdvancedTextFormatter.new(text, options).to_s + else + TextFormatter.new(text, options).to_s + end end end diff --git a/lib/sanitize_ext/sanitize_config.rb b/lib/sanitize_ext/sanitize_config.rb index ecaec2f84..935e1f4f6 100644 --- a/lib/sanitize_ext/sanitize_config.rb +++ b/lib/sanitize_ext/sanitize_config.rb @@ -55,18 +55,6 @@ class Sanitize end end - LINK_REL_TRANSFORMER = lambda do |env| - return unless env[:node_name] == 'a' and env[:node]['href'] - - node = env[:node] - - rel = (node['rel'] || '').split(' ') & ['tag'] - unless env[:config][:outgoing] && TagManager.instance.local_url?(node['href']) - rel += ['nofollow', 'noopener', 'noreferrer'] - end - node['rel'] = rel.join(' ') - end - UNSUPPORTED_HREF_TRANSFORMER = lambda do |env| return unless env[:node_name] == 'a' @@ -97,6 +85,7 @@ class Sanitize add_attributes: { 'a' => { + 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank', }, }, @@ -110,7 +99,6 @@ class Sanitize CLASS_WHITELIST_TRANSFORMER, IMG_TAG_TRANSFORMER, UNSUPPORTED_HREF_TRANSFORMER, - LINK_REL_TRANSFORMER, ] ) @@ -135,5 +123,48 @@ class Sanitize 'source' => { 'src' => HTTP_PROTOCOLS } ) ) + + LINK_REL_TRANSFORMER = lambda do |env| + return unless env[:node_name] == 'a' && env[:node]['href'] + + node = env[:node] + + rel = (node['rel'] || '').split(' ') & ['tag'] + rel += ['nofollow', 'noopener', 'noreferrer'] unless TagManager.instance.local_url?(node['href']) + + if rel.empty? + node['rel']&.delete + else + node['rel'] = rel.join(' ') + end + end + + LINK_TARGET_TRANSFORMER = lambda do |env| + return unless env[:node_name] == 'a' && env[:node]['href'] + + node = env[:node] + if node['target'] != '_blank' && TagManager.instance.local_url?(node['href']) + node['target']&.delete + else + node['target'] = '_blank' + end + end + + MASTODON_OUTGOING ||= freeze_config MASTODON_STRICT.merge( + attributes: merge( + MASTODON_STRICT[:attributes], + 'a' => %w(href rel class title target) + ), + + add_attributes: {}, + + transformers: [ + CLASS_WHITELIST_TRANSFORMER, + IMG_TAG_TRANSFORMER, + UNSUPPORTED_HREF_TRANSFORMER, + LINK_REL_TRANSFORMER, + LINK_TARGET_TRANSFORMER, + ] + ) end end diff --git a/spec/lib/advanced_text_formatter_spec.rb b/spec/lib/advanced_text_formatter_spec.rb new file mode 100644 index 000000000..c097b86e1 --- /dev/null +++ b/spec/lib/advanced_text_formatter_spec.rb @@ -0,0 +1,274 @@ +require 'rails_helper' + +RSpec.describe AdvancedTextFormatter do + describe '#to_s' do + let(:preloaded_accounts) { nil } + let(:content_type) { 'text/markdown' } + + subject { described_class.new(text, preloaded_accounts: preloaded_accounts, content_type: content_type).to_s } + + context 'given a markdown source' do + let(:content_type) { 'text/markdown' } + + context 'given text containing plain text' do + let(:text) { 'text' } + + it 'paragraphizes the text' do + is_expected.to eq '

text

' + end + end + + context 'given text containing line feeds' do + let(:text) { "line\nfeed" } + + it 'removes line feeds' do + is_expected.not_to include "\n" + end + end + + context 'given some inline code using backticks' do + let(:text) { 'test `foo` bar' } + + it 'formats code using ' do + is_expected.to include 'test foo bar' + end + end + + context 'given some quote' do + let(:text) { "> foo\n\nbar" } + + it 'formats code using ' do + is_expected.to include '

foo

' + end + end + + context 'given text containing linkable mentions' do + let(:preloaded_accounts) { [Fabricate(:account, username: 'alice')] } + let(:text) { '@alice' } + + it 'creates a mention link' do + is_expected.to include '@alice' + end + end + + context 'given text containing unlinkable mentions' do + let(:preloaded_accounts) { [] } + let(:text) { '@alice' } + + it 'does not create a mention link' do + is_expected.to include '@alice' + end + end + + context 'given a stand-alone medium URL' do + let(:text) { 'https://hackernoon.com/the-power-to-build-communities-a-response-to-mark-zuckerberg-3f2cac9148a4' } + + it 'matches the full URL' do + is_expected.to include 'href="https://hackernoon.com/the-power-to-build-communities-a-response-to-mark-zuckerberg-3f2cac9148a4"' + end + end + + context 'given a stand-alone google URL' do + let(:text) { 'http://google.com' } + + it 'matches the full URL' do + is_expected.to include 'href="http://google.com"' + end + end + + context 'given a stand-alone URL with a newer TLD' do + let(:text) { 'http://example.gay' } + + it 'matches the full URL' do + is_expected.to include 'href="http://example.gay"' + end + end + + context 'given a stand-alone IDN URL' do + let(:text) { 'https://nic.みんな/' } + + it 'matches the full URL' do + is_expected.to include 'href="https://nic.みんな/"' + end + + it 'has display URL' do + is_expected.to include 'nic.みんな/' + end + end + + context 'given a URL with a trailing period' do + let(:text) { 'http://www.mcmansionhell.com/post/156408871451/50-states-of-mcmansion-hell-scottsdale-arizona. ' } + + it 'matches the full URL but not the period' do + is_expected.to include 'href="http://www.mcmansionhell.com/post/156408871451/50-states-of-mcmansion-hell-scottsdale-arizona"' + end + end + + context 'given a URL enclosed with parentheses' do + let(:text) { '(http://google.com/)' } + + it 'matches the full URL but not the parentheses' do + is_expected.to include 'href="http://google.com/"' + end + end + + context 'given a URL with a trailing exclamation point' do + let(:text) { 'http://www.google.com!' } + + it 'matches the full URL but not the exclamation point' do + is_expected.to include 'href="http://www.google.com"' + end + end + + context 'given a URL with a trailing single quote' do + let(:text) { "http://www.google.com'" } + + it 'matches the full URL but not the single quote' do + is_expected.to include 'href="http://www.google.com"' + end + end + end + + context 'given a URL with a trailing angle bracket' do + let(:text) { 'http://www.google.com>' } + + it 'matches the full URL but not the angle bracket' do + is_expected.to include 'href="http://www.google.com"' + end + end + + context 'given a URL with a query string' do + context 'with escaped unicode character' do + let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' } + + it 'matches the full URL' do + is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink"' + end + end + + context 'with unicode character' do + let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓&q=autolink' } + + it 'matches the full URL' do + is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓&q=autolink"' + end + end + + context 'with unicode character at the end' do + let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓' } + + it 'matches the full URL' do + is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓"' + end + end + + context 'with escaped and not escaped unicode characters' do + let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink' } + + it 'preserves escaped unicode characters' do + is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink"' + end + end + + context 'given a URL with parentheses in it' do + let(:text) { 'https://en.wikipedia.org/wiki/Diaspora_(software)' } + + it 'matches the full URL' do + is_expected.to include 'href="https://en.wikipedia.org/wiki/Diaspora_(software)"' + end + end + + context 'given a URL in quotation marks' do + let(:text) { '"https://example.com/"' } + + it 'does not match the quotation marks' do + is_expected.to include 'href="https://example.com/"' + end + end + + context 'given a URL in angle brackets' do + let(:text) { '' } + + it 'does not match the angle brackets' do + is_expected.to include 'href="https://example.com/"' + end + end + + context 'given a URL containing unsafe code (XSS attack, invisible part)' do + let(:text) { %q{http://example.com/blahblahblahblah/a} } + + it 'does not include the HTML in the URL' do + is_expected.to include '"http://example.com/blahblahblahblah/a"' + end + + it 'does not include a script tag' do + is_expected.to_not include '' } + + it 'does not include a script tag' do + is_expected.to_not include '