Fix a double-decoding problem in RssAgent

The SAX parser Feedjira uses (Nokogiri::XML::SAX) tries to detect the
encoding of a document from the content even if it is already known
and given.  This results in the content being decoded twice, first by
WebRequestConcern and then by the SAX parser, if its encoding is
declared in both the Content-Type header and the XML declaration.

This commit makes RssAgent remove the `encoding` attribute from the
XML declaration of a document if the encoding is already known from
the Content-Type header.

Fixes #1797.
This commit is contained in:
Akinori MUSHA 2016-11-22 12:01:43 +09:00
parent 6fb8fe2292
commit 0b3700999b
3 changed files with 41 additions and 1 deletions

View file

@ -132,7 +132,7 @@ module Agents
begin
response = faraday.get(url)
if response.success?
feed = Feedjira::Feed.parse(response.body)
feed = Feedjira::Feed.parse(preprocessed_body(response))
new_events.concat feed_to_events(feed)
else
error "Failed to fetch #{url}: #{response.inspect}"
@ -170,6 +170,20 @@ module Agents
require 'feedjira_extension'
end
# Returns the response body in a form that can be handed to the feed parser
# without its encoding being applied a second time.
#
# A body tagged ASCII-8BIT is returned untouched so that the parser can
# detect the encoding from the content.  For any other encoding, the
# `encoding` pseudo-attribute is dropped from the XML declaration.
#
# Only a declaration that is the first markup in the document is rewritten
# (text before the first `<`, such as a byte order mark or whitespace, is
# tolerated), and quoted values are matched strictly, so `encoding="..."`
# attributes on ordinary elements or inside embedded documents are never
# touched.  The body held by `response` is not modified.
#
# @param response [#body] object whose `body` returns the document as a String
# @return [String] the untouched body for ASCII-8BIT input, otherwise a copy
#   of the body without the `encoding` pseudo-attribute
def preprocessed_body(response)
  body = response.body

  # Encoding is unknown from the Content-Type, so let the SAX parser
  # detect it from the content.
  return body if body.encoding == Encoding::ASCII_8BIT

  # Encoding is already known, so do not let the parser detect it from
  # the XML declaration in the content.  The pattern is kept ASCII-only
  # so that it stays compatible with any ASCII-compatible body encoding.
  body.sub(
    /\A([^<]*<\?xml(?:\s+\w+\s*=\s*(?:'[^']*'|"[^"]*"))*?)\s+encoding\s*=\s*(?:'[^']*'|"[^"]*")/,
    '\\1'
  )
end
def feed_data(feed)
type =
case feed.class.name

View file

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="iso-8859-1" ?>
<rss xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
<channel>
<title>Zeuhl</title>
<link>http://example.net/</link>
<item>
<title>Mëkanïk Zaïn</title>
<link>http://example.net/post/1</link>
<guid>http://example.net/post/1</guid>
<pubDate>Mon, 21 Nov 2016 17:00:10 +0100</pubDate>
</item>
</channel>
</rss>

View file

@ -12,6 +12,7 @@ describe Agents::RssAgent do
stub_request(:any, /SlickdealsnetFP/).to_return(:body => File.read(Rails.root.join("spec/data_fixtures/slickdeals.atom")), :status => 200)
stub_request(:any, /onethingwell.org/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/onethingwell.rss")), status: 200)
stub_request(:any, /bad.onethingwell.org/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/onethingwell.rss")).gsub(/(?<=<link>)[^<]*/, ''), status: 200)
stub_request(:any, /iso-8859-1/).to_return(body: File.binread(Rails.root.join("spec/data_fixtures/iso-8859-1.rss")), headers: { 'Content-Type' => 'application/rss+xml; charset=ISO-8859-1' }, status: 200)
end
let(:agent) do
@ -283,6 +284,18 @@ describe Agents::RssAgent do
expect(event.payload['links']).to eq([])
end
end
# Regression example for a feed whose charset is announced both in the
# Content-Type header of the stubbed response and in the XML declaration
# of the fixture; the non-ASCII title must come through intact.
context 'with the encoding declared in both headers and the content' do
  before { @valid_options['url'] = 'http://example.org/iso-8859-1.rss' }

  it "decodes the content properly" do
    agent.check
    title = agent.events.first.payload['title']
    expect(title).to eq('Mëkanïk Zaïn')
  end
end
end
describe 'logging errors with the feed url' do