From 50b5833a3ff1801832dd10534f0f20f3f65e98c6 Mon Sep 17 00:00:00 2001 From: Akinori MUSHA Date: Tue, 25 Oct 2016 10:18:06 +0900 Subject: [PATCH] Improve encoding detection in WebsiteAgent Previously, WebsiteAgent always assumed that a content with no charset specified in the Content-Type header would be encoded in UTF-8. This enhancement is to make use of the encoding detector implemented in Nokogiri for HTML/XML documents, instead of blindly falling back to UTF-8. When the document `type` is `html` or `xml`, WebsiteAgent tries to detect the encoding of a fetched document from the presence of a BOM, XML declaration, or HTML `meta` tag. This fixes #1742. --- app/concerns/web_request_concern.rb | 5 + app/models/agents/website_agent.rb | 17 +- spec/models/agents/website_agent_spec.rb | 377 +++++++++++++++++------ 3 files changed, 301 insertions(+), 98 deletions(-) diff --git a/app/concerns/web_request_concern.rb b/app/concerns/web_request_concern.rb index 71a76427..538782db 100644 --- a/app/concerns/web_request_concern.rb +++ b/app/concerns/web_request_concern.rb @@ -46,6 +46,8 @@ module WebRequestConcern # Never try to transcode a binary content next end + # Return body as binary if default_encoding is nil + next if encoding.nil? end body.encode!(Encoding::UTF_8, encoding) end @@ -89,6 +91,9 @@ module WebRequestConcern end end + # The default encoding for a text content with no `charset` + # specified in the Content-Type header. Override this and make it + # return nil if you want to detect the encoding on your own. def default_encoding Encoding::UTF_8 end diff --git a/app/models/agents/website_agent.rb b/app/models/agents/website_agent.rb index c85dbbd7..dfa692a7 100644 --- a/app/models/agents/website_agent.rb +++ b/app/models/agents/website_agent.rb @@ -94,7 +94,12 @@ module Agents Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance). 
This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results. - Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid, or wrong charset in the Content-Type header. Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1). + Set `force_encoding` to an encoding name (such as `UTF-8` or `ISO-8859-1`) if the website is known to respond with a missing, invalid, or wrong charset in the Content-Type header. Below are the steps to detect the encoding of fetched content: + + 1. If `force_encoding` is given, use the value. + 2. If the Content-Type header contains a charset parameter, use the value. + 3. When `type` is `html` or `xml`, check for the presence of a BOM, XML declaration with attribute "encoding", and an HTML meta tag with charset information. + 4. Fall back to UTF-8 (not ISO-8859-1). Set `user_agent` to a custom User-Agent name if the website does not like the default value (`#{default_user_agent}`). 
@@ -307,6 +312,16 @@ module Agents error "Error when fetching url: #{e.message}\n#{e.backtrace.join("\n")}" end + def default_encoding + case extraction_type + when 'html', 'xml' + # Let Nokogiri detect the encoding + nil + else + super + end + end + def handle_data(body, url, existing_payload) doc = parse(body) diff --git a/spec/models/agents/website_agent_spec.rb b/spec/models/agents/website_agent_spec.rb index 7a562f78..9315a4e1 100644 --- a/spec/models/agents/website_agent_spec.rb +++ b/spec/models/agents/website_agent_spec.rb @@ -318,110 +318,293 @@ describe Agents::WebsiteAgent do end describe 'encoding' do - it 'should be forced with force_encoding option' do - huginn = "\u{601d}\u{8003}" - stub_request(:any, /no-encoding/).to_return(body: { - value: huginn, - }.to_json.encode(Encoding::EUC_JP).b, headers: { - 'Content-Type' => 'application/json', - }, status: 200) - site = { - 'name' => "Some JSON Response", - 'expected_update_period_in_days' => "2", - 'type' => "json", - 'url' => "http://no-encoding.example.com", - 'mode' => 'on_change', - 'extract' => { - 'value' => { 'path' => 'value' }, - }, - 'force_encoding' => 'EUC-JP', - } - checker = Agents::WebsiteAgent.new(name: "No Encoding Site", options: site) - checker.user = users(:bob) - checker.save! 
- - expect { checker.check }.to change { Event.count }.by(1) - event = Event.last - expect(event.payload['value']).to eq(huginn) + let :huginn do + "\u{601d}\u{8003}" end - it 'should be overridden with force_encoding option' do - huginn = "\u{601d}\u{8003}" - stub_request(:any, /wrong-encoding/).to_return(body: { - value: huginn, - }.to_json.encode(Encoding::EUC_JP).b, headers: { - 'Content-Type' => 'application/json; UTF-8', - }, status: 200) - site = { - 'name' => "Some JSON Response", - 'expected_update_period_in_days' => "2", - 'type' => "json", - 'url' => "http://wrong-encoding.example.com", - 'mode' => 'on_change', - 'extract' => { - 'value' => { 'path' => 'value' }, - }, - 'force_encoding' => 'EUC-JP', - } - checker = Agents::WebsiteAgent.new(name: "Wrong Encoding Site", options: site) - checker.user = users(:bob) - checker.save! - - expect { checker.check }.to change { Event.count }.by(1) - event = Event.last - expect(event.payload['value']).to eq(huginn) + let :odin do + "\u{d3}\u{f0}inn" end - it 'should be determined by charset in Content-Type' do - huginn = "\u{601d}\u{8003}" - stub_request(:any, /charset-euc-jp/).to_return(body: { - value: huginn, - }.to_json.encode(Encoding::EUC_JP), headers: { - 'Content-Type' => 'application/json; charset=EUC-JP', - }, status: 200) - site = { - 'name' => "Some JSON Response", - 'expected_update_period_in_days' => "2", - 'type' => "json", - 'url' => "http://charset-euc-jp.example.com", - 'mode' => 'on_change', - 'extract' => { - 'value' => { 'path' => 'value' }, - }, - } - checker = Agents::WebsiteAgent.new(name: "Charset reader", options: site) - checker.user = users(:bob) - checker.save! 
- - expect { checker.check }.to change { Event.count }.by(1) - event = Event.last - expect(event.payload['value']).to eq(huginn) + let :url do + 'http://encoding-test.example.com/' end - it 'should default to UTF-8 when unknown charset is found' do - huginn = "\u{601d}\u{8003}" - stub_request(:any, /charset-unknown/).to_return(body: { - value: huginn, - }.to_json.b, headers: { - 'Content-Type' => 'application/json; charset=unicode', - }, status: 200) - site = { - 'name' => "Some JSON Response", - 'expected_update_period_in_days' => "2", - 'type' => "json", - 'url' => "http://charset-unknown.example.com", - 'mode' => 'on_change', - 'extract' => { - 'value' => { 'path' => 'value' }, - }, - } - checker = Agents::WebsiteAgent.new(name: "Charset reader", options: site) - checker.user = users(:bob) - checker.save! + let :content_type do + raise 'define me' + end - expect { checker.check }.to change { Event.count }.by(1) - event = Event.last - expect(event.payload['value']).to eq(huginn) + let :body do + raise 'define me' + end + + before do + stub_request(:any, url).to_return( + headers: { + 'Content-Type' => content_type, + }, + body: body.b, + status: 200) + end + + let :options do + { + 'name' => 'Some agent', + 'expected_update_period_in_days' => '2', + 'url' => url, + 'mode' => 'on_change', + } + end + + let :checker do + Agents::WebsiteAgent.create!(name: 'Encoding Checker', options: options) { |agent| + agent.user = users(:bob) + } + end + + context 'with no encoding information' do + context 'for a JSON file' do + let :content_type do + 'application/json' + end + + let :body do + { + value: huginn, + }.to_json + end + + let :options do + super().merge( + 'type' => 'json', + 'extract' => { + 'value' => { 'path' => 'value' } + } + ) + end + + it 'should be assumed to be UTF-8' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(huginn) + end + end + + context 'for an HTML file' do + let 
:content_type do + 'text/html' + end + + let :options do + super().merge( + 'type' => 'html', + 'extract' => { + 'value' => { 'css' => 'title', 'value' => 'string(.)' } + } + ) + end + + context 'with a charset in the header' do + let :content_type do + super() + '; charset=iso-8859-1' + end + + let :body do + <<-HTML.encode(Encoding::ISO_8859_1) + +#{odin} +

Hello, world. + HTML + end + + it 'should be detected from it' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(odin) + end + end + + context 'with no charset in the header' do + let :body do + <<-HTML.encode(Encoding::ISO_8859_1) + + +#{odin} +

Hello, world. + HTML + end + + it 'should be detected from a meta tag' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(odin) + end + end + + context 'with charset declarations both in the header and in the content' do + let :content_type do + super() + '; charset=iso-8859-1' + end + + let :body do + <<-HTML.encode(Encoding::ISO_8859_1) + + +#{odin} +

Hello, world. + HTML + end + + it 'should be detected as that of the header' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(odin) + end + end + end + + context 'for an XML file' do + let :content_type do + 'application/xml' + end + + let :options do + super().merge( + 'type' => 'xml', + 'extract' => { + 'value' => { 'xpath' => '/root/message', 'value' => 'string(.)' } + } + ) + end + + context 'with a charset in the header' do + let :content_type do + super() + '; charset=euc-jp' + end + + let :body do + <<-XML.encode(Encoding::EUC_JP) + + + #{huginn} + + XML + end + + it 'should be detected from it' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(huginn) + end + end + + context 'with no charset in the header' do + context 'but in XML declaration' do + let :body do + <<-XML.encode(Encoding::EUC_JP) + + + #{huginn} + + XML + end + + it 'should be detected' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(huginn) + end + end + + context 'but having a BOM' do + let :body do + <<-XML.encode(Encoding::UTF_16LE) +\u{feff} + + #{huginn} + + XML + end + + it 'should be detected' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(huginn) + end + end + end + end + end + + context 'when force_encoding option is specified' do + let :options do + super().merge( + 'force_encoding' => 'EUC-JP' + ) + end + + context 'for a JSON file' do + let :content_type do + 'application/json' + end + + let :body do + { + value: huginn, + }.to_json.encode(Encoding::EUC_JP) + end + + let :options do + super().merge( + 'type' => 'json', + 'extract' => { + 'value' => { 'path' => 'value' } + } + ) + end + + it 'should be forced' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last 
+ expect(event.payload['value']).to eq(huginn) + end + end + + context 'for an HTML file' do + let :content_type do + 'text/html' + end + + context 'with charset specified in the header and the content' do + let :content_type do + super() + '; charset=UTF-8' + end + + let :body do + <<-HTML.encode(Encoding::EUC_JP) + + +#{huginn} +

Hello, world. + HTML + end + + let :options do + super().merge( + 'type' => 'html', + 'extract' => { + 'value' => { 'css' => 'title', 'value' => 'string(.)' } + } + ) + end + + it 'should still be forced' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(huginn) + end + end + end end end