Improve encoding detection in WebsiteAgent

Previously, WebsiteAgent always assumed that a content with no charset
specified in the Content-Type header would be encoded in UTF-8.  This
enhancement is to make use of the encoding detector implemented in
Nokogiri for HTML/XML documents, instead of blindly falling back to
UTF-8.

When the document `type` is `html` or `xml`, WebsiteAgent tries to
detect the encoding of a fetched document from the presence of a BOM,
XML declaration, or HTML `meta` tag.

This fixes #1742.
This commit is contained in:
Akinori MUSHA 2016-10-25 10:18:06 +09:00
parent 950975d7a8
commit 50b5833a3f
3 changed files with 301 additions and 98 deletions

View file

@ -46,6 +46,8 @@ module WebRequestConcern
# Never try to transcode a binary content
next
end
# Return body as binary if default_encoding is nil
next if encoding.nil?
end
body.encode!(Encoding::UTF_8, encoding)
end
@ -89,6 +91,9 @@ module WebRequestConcern
end
end
# The default encoding for a text content with no `charset`
# specified in the Content-Type header. Override this and make it
# return nil if you want to detect the encoding on your own.
def default_encoding
Encoding::UTF_8
end

View file

@ -94,7 +94,12 @@ module Agents
Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance). This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results.
Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid, or wrong charset in the Content-Type header. Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1).
Set `force_encoding` to an encoding name (such as `UTF-8` and `ISO-8859-1`) if the website is known to respond with a missing, invalid, or wrong charset in the Content-Type header. Below are the steps to detect the encoding of a fetched content:
1. If `force_encoding` is given, use the value.
2. If the Content-Type header contains a charset parameter, use the value.
3. When `type` is `html` or `xml`, check for the presence of a BOM, XML declaration with attribute "encoding", and an HTML meta tag with charset information.
4. Fall back to UTF-8 (not ISO-8859-1).
Set `user_agent` to a custom User-Agent name if the website does not like the default value (`#{default_user_agent}`).
@ -307,6 +312,16 @@ module Agents
error "Error when fetching url: #{e.message}\n#{e.backtrace.join("\n")}"
end
def default_encoding
case extraction_type
when 'html', 'xml'
# Let Nokogiri detect the encoding
nil
else
super
end
end
def handle_data(body, url, existing_payload)
doc = parse(body)

View file

@ -318,110 +318,293 @@ describe Agents::WebsiteAgent do
end
describe 'encoding' do
it 'should be forced with force_encoding option' do
huginn = "\u{601d}\u{8003}"
stub_request(:any, /no-encoding/).to_return(body: {
value: huginn,
}.to_json.encode(Encoding::EUC_JP).b, headers: {
'Content-Type' => 'application/json',
}, status: 200)
site = {
'name' => "Some JSON Response",
'expected_update_period_in_days' => "2",
'type' => "json",
'url' => "http://no-encoding.example.com",
'mode' => 'on_change',
'extract' => {
'value' => { 'path' => 'value' },
},
'force_encoding' => 'EUC-JP',
}
checker = Agents::WebsiteAgent.new(name: "No Encoding Site", options: site)
checker.user = users(:bob)
checker.save!
expect { checker.check }.to change { Event.count }.by(1)
event = Event.last
expect(event.payload['value']).to eq(huginn)
let :huginn do
"\u{601d}\u{8003}"
end
it 'should be overridden with force_encoding option' do
huginn = "\u{601d}\u{8003}"
stub_request(:any, /wrong-encoding/).to_return(body: {
value: huginn,
}.to_json.encode(Encoding::EUC_JP).b, headers: {
'Content-Type' => 'application/json; UTF-8',
}, status: 200)
site = {
'name' => "Some JSON Response",
'expected_update_period_in_days' => "2",
'type' => "json",
'url' => "http://wrong-encoding.example.com",
'mode' => 'on_change',
'extract' => {
'value' => { 'path' => 'value' },
},
'force_encoding' => 'EUC-JP',
}
checker = Agents::WebsiteAgent.new(name: "Wrong Encoding Site", options: site)
checker.user = users(:bob)
checker.save!
expect { checker.check }.to change { Event.count }.by(1)
event = Event.last
expect(event.payload['value']).to eq(huginn)
let :odin do
"\u{d3}\u{f0}inn"
end
it 'should be determined by charset in Content-Type' do
huginn = "\u{601d}\u{8003}"
stub_request(:any, /charset-euc-jp/).to_return(body: {
value: huginn,
}.to_json.encode(Encoding::EUC_JP), headers: {
'Content-Type' => 'application/json; charset=EUC-JP',
}, status: 200)
site = {
'name' => "Some JSON Response",
'expected_update_period_in_days' => "2",
'type' => "json",
'url' => "http://charset-euc-jp.example.com",
'mode' => 'on_change',
'extract' => {
'value' => { 'path' => 'value' },
},
}
checker = Agents::WebsiteAgent.new(name: "Charset reader", options: site)
checker.user = users(:bob)
checker.save!
expect { checker.check }.to change { Event.count }.by(1)
event = Event.last
expect(event.payload['value']).to eq(huginn)
let :url do
'http://encoding-test.example.com/'
end
it 'should default to UTF-8 when unknown charset is found' do
huginn = "\u{601d}\u{8003}"
stub_request(:any, /charset-unknown/).to_return(body: {
value: huginn,
}.to_json.b, headers: {
'Content-Type' => 'application/json; charset=unicode',
}, status: 200)
site = {
'name' => "Some JSON Response",
'expected_update_period_in_days' => "2",
'type' => "json",
'url' => "http://charset-unknown.example.com",
'mode' => 'on_change',
'extract' => {
'value' => { 'path' => 'value' },
},
}
checker = Agents::WebsiteAgent.new(name: "Charset reader", options: site)
checker.user = users(:bob)
checker.save!
let :content_type do
raise 'define me'
end
expect { checker.check }.to change { Event.count }.by(1)
event = Event.last
expect(event.payload['value']).to eq(huginn)
let :body do
raise 'define me'
end
before do
stub_request(:any, url).to_return(
headers: {
'Content-Type' => content_type,
},
body: body.b,
status: 200)
end
let :options do
{
'name' => 'Some agent',
'expected_update_period_in_days' => '2',
'url' => url,
'mode' => 'on_change',
}
end
let :checker do
Agents::WebsiteAgent.create!(name: 'Encoding Checker', options: options) { |agent|
agent.user = users(:bob)
}
end
context 'with no encoding information' do
context 'for a JSON file' do
let :content_type do
'application/json'
end
let :body do
{
value: huginn,
}.to_json
end
let :options do
super().merge(
'type' => 'json',
'extract' => {
'value' => { 'path' => 'value' }
}
)
end
it 'should be assumed to be UTF-8' do
expect { checker.check }.to change { Event.count }.by(1)
event = Event.last
expect(event.payload['value']).to eq(huginn)
end
end
context 'for an HTML file' do
let :content_type do
'text/html'
end
let :options do
super().merge(
'type' => 'html',
'extract' => {
'value' => { 'css' => 'title', 'value' => 'string(.)' }
}
)
end
context 'with a charset in the header' do
let :content_type do
super() + '; charset=iso-8859-1'
end
let :body do
<<-HTML.encode(Encoding::ISO_8859_1)
<!DOCTYPE html>
<title>#{odin}</title>
<p>Hello, world.
HTML
end
it 'should be detected from it' do
expect { checker.check }.to change { Event.count }.by(1)
event = Event.last
expect(event.payload['value']).to eq(odin)
end
end
context 'with no charset in the header' do
let :body do
<<-HTML.encode(Encoding::ISO_8859_1)
<!DOCTYPE html>
<meta charset="iso-8859-1">
<title>#{odin}</title>
<p>Hello, world.
HTML
end
it 'should be detected from a meta tag' do
expect { checker.check }.to change { Event.count }.by(1)
event = Event.last
expect(event.payload['value']).to eq(odin)
end
end
context 'with charset desclarations both in the header and in the content' do
let :content_type do
super() + '; charset=iso-8859-1'
end
let :body do
<<-HTML.encode(Encoding::ISO_8859_1)
<!DOCTYPE html>
<meta charset="UTF-8">
<title>#{odin}</title>
<p>Hello, world.
HTML
end
it 'should be detected as that of the header' do
expect { checker.check }.to change { Event.count }.by(1)
event = Event.last
expect(event.payload['value']).to eq(odin)
end
end
end
context 'for an XML file' do
let :content_type do
'application/xml'
end
let :options do
super().merge(
'type' => 'xml',
'extract' => {
'value' => { 'xpath' => '/root/message', 'value' => 'string(.)' }
}
)
end
context 'with a charset in the header' do
let :content_type do
super() + '; charset=euc-jp'
end
let :body do
<<-XML.encode(Encoding::EUC_JP)
<?xml version="1.0"?>
<root>
<message>#{huginn}</message>
</root>
XML
end
it 'should be detected from it' do
expect { checker.check }.to change { Event.count }.by(1)
event = Event.last
expect(event.payload['value']).to eq(huginn)
end
end
context 'with no charset in the header' do
context 'but in XML declaration' do
let :body do
<<-XML.encode(Encoding::EUC_JP)
<?xml version="1.0" encoding="euc-jp"?>
<root>
<message>#{huginn}</message>
</root>
XML
end
it 'should be detected' do
expect { checker.check }.to change { Event.count }.by(1)
event = Event.last
expect(event.payload['value']).to eq(huginn)
end
end
context 'but having a BOM' do
let :body do
<<-XML.encode(Encoding::UTF_16LE)
\u{feff}<?xml version="1.0"?>
<root>
<message>#{huginn}</message>
</root>
XML
end
it 'should be detected' do
expect { checker.check }.to change { Event.count }.by(1)
event = Event.last
expect(event.payload['value']).to eq(huginn)
end
end
end
end
end
context 'when force_encoding option is specified' do
let :options do
super().merge(
'force_encoding' => 'EUC-JP'
)
end
context 'for a JSON file' do
let :content_type do
'application/json'
end
let :body do
{
value: huginn,
}.to_json.encode(Encoding::EUC_JP)
end
let :options do
super().merge(
'type' => 'json',
'extract' => {
'value' => { 'path' => 'value' }
}
)
end
it 'should be forced' do
expect { checker.check }.to change { Event.count }.by(1)
event = Event.last
expect(event.payload['value']).to eq(huginn)
end
end
context 'for an HTML file' do
let :content_type do
'text/html'
end
context 'with charset specified in the header and the content' do
let :content_type do
super() + '; charset=UTF-8'
end
let :body do
<<-HTML.encode(Encoding::EUC_JP)
<!DOCTYPE html>
<meta charset="UTF-8"/>
<title>#{huginn}</title>
<p>Hello, world.
HTML
end
let :options do
super().merge(
'type' => 'html',
'extract' => {
'value' => { 'css' => 'title', 'value' => 'string(.)' }
}
)
end
it 'should still be forced' do
expect { checker.check }.to change { Event.count }.by(1)
event = Event.last
expect(event.payload['value']).to eq(huginn)
end
end
end
end
end