From 50b5833a3ff1801832dd10534f0f20f3f65e98c6 Mon Sep 17 00:00:00 2001 From: Akinori MUSHA Date: Tue, 25 Oct 2016 10:18:06 +0900 Subject: [PATCH] Improve encoding detection in WebsiteAgent Previously, WebsiteAgent always assumed that a content with no charset specified in the Content-Type header would be encoded in UTF-8. This enhancement is to make use of the encoding detector implemented in Nokogiri for HTML/XML documents, instead of blindly falling back to UTF-8. When the document `type` is `html` or `xml`, WebsiteAgent tries to detect the encoding of a fetched document from the presence of a BOM, XML declaration, or HTML `meta` tag. This fixes #1742. --- app/concerns/web_request_concern.rb | 5 + app/models/agents/website_agent.rb | 17 +- spec/models/agents/website_agent_spec.rb | 377 +++++++++++++++++------ 3 files changed, 301 insertions(+), 98 deletions(-) diff --git a/app/concerns/web_request_concern.rb b/app/concerns/web_request_concern.rb index 71a76427..538782db 100644 --- a/app/concerns/web_request_concern.rb +++ b/app/concerns/web_request_concern.rb @@ -46,6 +46,8 @@ module WebRequestConcern # Never try to transcode a binary content next end + # Return body as binary if default_encoding is nil + next if encoding.nil? end body.encode!(Encoding::UTF_8, encoding) end @@ -89,6 +91,9 @@ module WebRequestConcern end end + # The default encoding for a text content with no `charset` + # specified in the Content-Type header. Override this and make it + # return nil if you want to detect the encoding on your own. def default_encoding Encoding::UTF_8 end diff --git a/app/models/agents/website_agent.rb b/app/models/agents/website_agent.rb index c85dbbd7..dfa692a7 100644 --- a/app/models/agents/website_agent.rb +++ b/app/models/agents/website_agent.rb @@ -94,7 +94,12 @@ module Agents Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance). 
This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results. - Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid, or wrong charset in the Content-Type header. Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1). + Set `force_encoding` to an encoding name (such as `UTF-8` or `ISO-8859-1`) if the website is known to respond with a missing, invalid, or wrong charset in the Content-Type header. Below are the steps to detect the encoding of fetched content: + + 1. If `force_encoding` is given, use the value. + 2. If the Content-Type header contains a charset parameter, use the value. + 3. When `type` is `html` or `xml`, check for the presence of a BOM, XML declaration with attribute "encoding", and an HTML meta tag with charset information. + 4. Fall back to UTF-8 (not ISO-8859-1). Set `user_agent` to a custom User-Agent name if the website does not like the default value (`#{default_user_agent}`). 
@@ -307,6 +312,16 @@ module Agents error "Error when fetching url: #{e.message}\n#{e.backtrace.join("\n")}" end + def default_encoding + case extraction_type + when 'html', 'xml' + # Let Nokogiri detect the encoding + nil + else + super + end + end + def handle_data(body, url, existing_payload) doc = parse(body) diff --git a/spec/models/agents/website_agent_spec.rb b/spec/models/agents/website_agent_spec.rb index 7a562f78..9315a4e1 100644 --- a/spec/models/agents/website_agent_spec.rb +++ b/spec/models/agents/website_agent_spec.rb @@ -318,110 +318,293 @@ describe Agents::WebsiteAgent do end describe 'encoding' do - it 'should be forced with force_encoding option' do - huginn = "\u{601d}\u{8003}" - stub_request(:any, /no-encoding/).to_return(body: { - value: huginn, - }.to_json.encode(Encoding::EUC_JP).b, headers: { - 'Content-Type' => 'application/json', - }, status: 200) - site = { - 'name' => "Some JSON Response", - 'expected_update_period_in_days' => "2", - 'type' => "json", - 'url' => "http://no-encoding.example.com", - 'mode' => 'on_change', - 'extract' => { - 'value' => { 'path' => 'value' }, - }, - 'force_encoding' => 'EUC-JP', - } - checker = Agents::WebsiteAgent.new(name: "No Encoding Site", options: site) - checker.user = users(:bob) - checker.save! 
- - expect { checker.check }.to change { Event.count }.by(1) - event = Event.last - expect(event.payload['value']).to eq(huginn) + let :huginn do + "\u{601d}\u{8003}" end - it 'should be overridden with force_encoding option' do - huginn = "\u{601d}\u{8003}" - stub_request(:any, /wrong-encoding/).to_return(body: { - value: huginn, - }.to_json.encode(Encoding::EUC_JP).b, headers: { - 'Content-Type' => 'application/json; UTF-8', - }, status: 200) - site = { - 'name' => "Some JSON Response", - 'expected_update_period_in_days' => "2", - 'type' => "json", - 'url' => "http://wrong-encoding.example.com", - 'mode' => 'on_change', - 'extract' => { - 'value' => { 'path' => 'value' }, - }, - 'force_encoding' => 'EUC-JP', - } - checker = Agents::WebsiteAgent.new(name: "Wrong Encoding Site", options: site) - checker.user = users(:bob) - checker.save! - - expect { checker.check }.to change { Event.count }.by(1) - event = Event.last - expect(event.payload['value']).to eq(huginn) + let :odin do + "\u{d3}\u{f0}inn" end - it 'should be determined by charset in Content-Type' do - huginn = "\u{601d}\u{8003}" - stub_request(:any, /charset-euc-jp/).to_return(body: { - value: huginn, - }.to_json.encode(Encoding::EUC_JP), headers: { - 'Content-Type' => 'application/json; charset=EUC-JP', - }, status: 200) - site = { - 'name' => "Some JSON Response", - 'expected_update_period_in_days' => "2", - 'type' => "json", - 'url' => "http://charset-euc-jp.example.com", - 'mode' => 'on_change', - 'extract' => { - 'value' => { 'path' => 'value' }, - }, - } - checker = Agents::WebsiteAgent.new(name: "Charset reader", options: site) - checker.user = users(:bob) - checker.save! 
- - expect { checker.check }.to change { Event.count }.by(1) - event = Event.last - expect(event.payload['value']).to eq(huginn) + let :url do + 'http://encoding-test.example.com/' end - it 'should default to UTF-8 when unknown charset is found' do - huginn = "\u{601d}\u{8003}" - stub_request(:any, /charset-unknown/).to_return(body: { - value: huginn, - }.to_json.b, headers: { - 'Content-Type' => 'application/json; charset=unicode', - }, status: 200) - site = { - 'name' => "Some JSON Response", - 'expected_update_period_in_days' => "2", - 'type' => "json", - 'url' => "http://charset-unknown.example.com", - 'mode' => 'on_change', - 'extract' => { - 'value' => { 'path' => 'value' }, - }, - } - checker = Agents::WebsiteAgent.new(name: "Charset reader", options: site) - checker.user = users(:bob) - checker.save! + let :content_type do + raise 'define me' + end - expect { checker.check }.to change { Event.count }.by(1) - event = Event.last - expect(event.payload['value']).to eq(huginn) + let :body do + raise 'define me' + end + + before do + stub_request(:any, url).to_return( + headers: { + 'Content-Type' => content_type, + }, + body: body.b, + status: 200) + end + + let :options do + { + 'name' => 'Some agent', + 'expected_update_period_in_days' => '2', + 'url' => url, + 'mode' => 'on_change', + } + end + + let :checker do + Agents::WebsiteAgent.create!(name: 'Encoding Checker', options: options) { |agent| + agent.user = users(:bob) + } + end + + context 'with no encoding information' do + context 'for a JSON file' do + let :content_type do + 'application/json' + end + + let :body do + { + value: huginn, + }.to_json + end + + let :options do + super().merge( + 'type' => 'json', + 'extract' => { + 'value' => { 'path' => 'value' } + } + ) + end + + it 'should be assumed to be UTF-8' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(huginn) + end + end + + context 'for an HTML file' do + let 
:content_type do + 'text/html' + end + + let :options do + super().merge( + 'type' => 'html', + 'extract' => { + 'value' => { 'css' => 'title', 'value' => 'string(.)' } + } + ) + end + + context 'with a charset in the header' do + let :content_type do + super() + '; charset=iso-8859-1' + end + + let :body do + <<-HTML.encode(Encoding::ISO_8859_1) + +#{odin} +

Hello, world. + HTML + end + + it 'should be detected from it' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(odin) + end + end + + context 'with no charset in the header' do + let :body do + <<-HTML.encode(Encoding::ISO_8859_1) + + +#{odin} +

Hello, world. + HTML + end + + it 'should be detected from a meta tag' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(odin) + end + end + + context 'with charset declarations both in the header and in the content' do + let :content_type do + super() + '; charset=iso-8859-1' + end + + let :body do + <<-HTML.encode(Encoding::ISO_8859_1) + + +#{odin} +

Hello, world. + HTML + end + + it 'should be detected as that of the header' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(odin) + end + end + end + + context 'for an XML file' do + let :content_type do + 'application/xml' + end + + let :options do + super().merge( + 'type' => 'xml', + 'extract' => { + 'value' => { 'xpath' => '/root/message', 'value' => 'string(.)' } + } + ) + end + + context 'with a charset in the header' do + let :content_type do + super() + '; charset=euc-jp' + end + + let :body do + <<-XML.encode(Encoding::EUC_JP) + + + #{huginn} + + XML + end + + it 'should be detected from it' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(huginn) + end + end + + context 'with no charset in the header' do + context 'but in XML declaration' do + let :body do + <<-XML.encode(Encoding::EUC_JP) + + + #{huginn} + + XML + end + + it 'should be detected' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(huginn) + end + end + + context 'but having a BOM' do + let :body do + <<-XML.encode(Encoding::UTF_16LE) +\u{feff} + + #{huginn} + + XML + end + + it 'should be detected' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(huginn) + end + end + end + end + end + + context 'when force_encoding option is specified' do + let :options do + super().merge( + 'force_encoding' => 'EUC-JP' + ) + end + + context 'for a JSON file' do + let :content_type do + 'application/json' + end + + let :body do + { + value: huginn, + }.to_json.encode(Encoding::EUC_JP) + end + + let :options do + super().merge( + 'type' => 'json', + 'extract' => { + 'value' => { 'path' => 'value' } + } + ) + end + + it 'should be forced' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last 
+ expect(event.payload['value']).to eq(huginn) + end + end + + context 'for an HTML file' do + let :content_type do + 'text/html' + end + + context 'with charset specified in the header and the content' do + let :content_type do + super() + '; charset=UTF-8' + end + + let :body do + <<-HTML.encode(Encoding::EUC_JP) + + +#{huginn} +

Hello, world. + HTML + end + + let :options do + super().merge( + 'type' => 'html', + 'extract' => { + 'value' => { 'css' => 'title', 'value' => 'string(.)' } + } + ) + end + + it 'should still be forced' do + expect { checker.check }.to change { Event.count }.by(1) + event = Event.last + expect(event.payload['value']).to eq(huginn) + end + end + end end end