From 7e79d576b57c73ec8d4185b4ce36b541030dbe50 Mon Sep 17 00:00:00 2001 From: Akinori MUSHA Date: Wed, 5 Oct 2016 14:17:04 +0900 Subject: [PATCH] Improve Utils.normalize_uri (#1719) * Improve Utils.normalize_uri Globally replacing generally unsafe characters in a URL would not fix invalid authorities and paths, so use Addressable::URI to normalize them when necessary. This should fix #1701. * Remove an unused function * Fix the test case to make sure an IPv6 address is supported --- lib/utils.rb | 29 ++++++++++++++++++++---- spec/data_fixtures/urlTest.html | 3 ++- spec/models/agents/website_agent_spec.rb | 9 ++++++-- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/lib/utils.rb b/lib/utils.rb index 84a1a40d..c192cea2 100644 --- a/lib/utils.rb +++ b/lib/utils.rb @@ -1,5 +1,6 @@ require 'jsonpath' require 'cgi' +require 'addressable/uri' module Utils def self.unindent(s) @@ -25,11 +26,29 @@ module Utils begin URI(uri) rescue URI::Error - URI(uri.to_s.gsub(/[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]+/) { |unsafe| - unsafe.bytes.each_with_object(String.new) { |uc, s| - s << sprintf('%%%02X', uc) - } - }.force_encoding(Encoding::US_ASCII)) + begin + URI(uri.to_s.gsub(/[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]+/) { |unsafe| + unsafe.bytes.each_with_object(String.new) { |uc, s| + s << sprintf('%%%02X', uc) + } + }.force_encoding(Encoding::US_ASCII)) + rescue URI::Error => e + begin + auri = Addressable::URI.parse(uri.to_s) + rescue + # Do not leak Addressable::URI::InvalidURIError which + # callers might not expect. + raise e + else + # Addressable::URI#normalize! modifies the query and + # fragment components beyond escaping unsafe characters, so + # avoid using it. Otherwise `?a[]=%2F` would be normalized + # as `?a%5B%5D=/`, for example. + auri.site = auri.normalized_site + auri.path = auri.normalized_path + URI(auri.to_s) + end + end end end diff --git a/spec/data_fixtures/urlTest.html b/spec/data_fixtures/urlTest.html index 156e7a1c..445f48d7 100644 --- a/spec/data_fixtures/urlTest.html +++ b/spec/data_fixtures/urlTest.html @@ -12,6 +12,7 @@
  • unicode param
  • percent encoded url
  • percent encoded param
  • +
  • brackets
  • - \ No newline at end of file + diff --git a/spec/models/agents/website_agent_spec.rb b/spec/models/agents/website_agent_spec.rb index 4e1be8e2..ba21a354 100644 --- a/spec/models/agents/website_agent_spec.rb +++ b/spec/models/agents/website_agent_spec.rb @@ -1105,8 +1105,8 @@ fire: hot describe "#check" do before do - expect { @checker.check }.to change { Event.count }.by(7) - @events = Event.last(7) + expect { @checker.check }.to change { Event.count }.by(8) + @events = Event.last(8) end it "should check hostname" do @@ -1143,6 +1143,11 @@ fire: hot event = @events[6] expect(event.payload['url']).to eq("https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8") end + + it "should check url with unescaped brackets in the path component" do + event = @events[7] + expect(event.payload['url']).to eq("http://[::1]/path%5B%5D?query[]=foo") + end end end end