From 0b3700999b73a1be4e71144a00d8b5d093dda126 Mon Sep 17 00:00:00 2001 From: Akinori MUSHA Date: Tue, 22 Nov 2016 12:01:43 +0900 Subject: [PATCH] Fix a double-decoding problem in RssAgent The SAX parser Feedjira uses (Nokogiri::XML::SAX) tries to detect the encoding of a document from the content even if it is already known and given. This results in the content being decoded twice, once by WebRequestConcern and once more by the SAX parser, if its encoding is declared in both the Content-Type header and the XML declaration. This commit makes RssAgent remove the `encoding` attribute from the XML declaration of a document if the encoding is already known from the Content-Type header. Fixes #1797. --- app/models/agents/rss_agent.rb | 16 +++++++++++++++- spec/data_fixtures/iso-8859-1.rss | 13 +++++++++++++ spec/models/agents/rss_agent_spec.rb | 13 +++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 spec/data_fixtures/iso-8859-1.rss diff --git a/app/models/agents/rss_agent.rb b/app/models/agents/rss_agent.rb index 19b5faf0..a1ac2095 100644 --- a/app/models/agents/rss_agent.rb +++ b/app/models/agents/rss_agent.rb @@ -132,7 +132,7 @@ module Agents begin response = faraday.get(url) if response.success? - feed = Feedjira::Feed.parse(response.body) + feed = Feedjira::Feed.parse(preprocessed_body(response)) new_events.concat feed_to_events(feed) else error "Failed to fetch #{url}: #{response.inspect}" @@ -170,6 +170,20 @@ module Agents require 'feedjira_extension' end + def preprocessed_body(response) + body = response.body + case body.encoding + when Encoding::ASCII_8BIT + # Encoding is unknown from the Content-Type, so let the SAX + # parser detect it from the content. + else + # Encoding is already known, so do not let the parser detect + # it from the XML declaration in the content. 
+ body.sub!(/(<\?xml(?:\s+\w+\s*=\s*(['"]).*?\2)*)\s+encoding\s*=\s*(['"]).*?\3/, '\\1') + end + body + end + def feed_data(feed) type = case feed.class.name diff --git a/spec/data_fixtures/iso-8859-1.rss b/spec/data_fixtures/iso-8859-1.rss new file mode 100644 index 00000000..cf15baa4 --- /dev/null +++ b/spec/data_fixtures/iso-8859-1.rss @@ -0,0 +1,13 @@ + + + + Zeuhl + http://example.net/ + + Mëkanïk Zaïn + http://example.net/post/1 + http://example.net/post/1 + Mon, 21 Nov 2016 17:00:10 +0100 + + + diff --git a/spec/models/agents/rss_agent_spec.rb b/spec/models/agents/rss_agent_spec.rb index 0d01cea8..a70565f8 100644 --- a/spec/models/agents/rss_agent_spec.rb +++ b/spec/models/agents/rss_agent_spec.rb @@ -12,6 +12,7 @@ describe Agents::RssAgent do stub_request(:any, /SlickdealsnetFP/).to_return(:body => File.read(Rails.root.join("spec/data_fixtures/slickdeals.atom")), :status => 200) stub_request(:any, /onethingwell.org/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/onethingwell.rss")), status: 200) stub_request(:any, /bad.onethingwell.org/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/onethingwell.rss")).gsub(/(?<=)[^<]*/, ''), status: 200) + stub_request(:any, /iso-8859-1/).to_return(body: File.binread(Rails.root.join("spec/data_fixtures/iso-8859-1.rss")), headers: { 'Content-Type' => 'application/rss+xml; charset=ISO-8859-1' }, status: 200) end let(:agent) do @@ -283,6 +284,18 @@ describe Agents::RssAgent do expect(event.payload['links']).to eq([]) end end + + context 'with the encoding declared in both headers and the content' do + before do + @valid_options['url'] = 'http://example.org/iso-8859-1.rss' + end + + it "decodes the content properly" do + agent.check + event = agent.events.first + expect(event.payload['title']).to eq('Mëkanïk Zaïn') + end + end end describe 'logging errors with the feed url' do