Merge pull request #1743 from cantino/website_agent_can_interpolate_after_extraction

WebsiteAgent can interpolate after extraction. Incorporating feedback from @cantino and @dsander.
2025-03-15 19:31:26 +00:00 · 2016-11-01 20:20:37 +09:00 · 2016-11-01 20:20:37 +09:00 · 91f096b16f
commit 91f096b16f
parent e3f1429a37 58fabb885c
6 changed files with 222 additions and 15 deletions
--- a/app/concerns/liquid_interpolatable.rb
+++ b/app/concerns/liquid_interpolatable.rb
@ -189,6 +189,11 @@ module LiquidInterpolatable
      url
    end

+    # Rebase URIs contained in attributes in a given HTML fragment
+    def rebase_hrefs(input, base_uri)
+      Utils.rebase_hrefs(input, base_uri) rescue input
+    end
+
    # Unescape (basic) HTML entities in a string
    #
    # This currently decodes the following entities only: "&apos;",
--- a/app/models/agents/website_agent.rb
+++ b/app/models/agents/website_agent.rb
@ -111,9 +111,22 @@ module Agents

      Set `http_success_codes` to an array of status codes (e.g., `[404, 422]`) to treat HTTP response codes beyond 200 as successes.

+      If a `template` option is given, it is used as a Liquid template for each event created by this Agent, instead of directly emitting the results of extraction as events.  In the template, keys of extracted data can be interpolated, and some additional variables are also available as explained in the next section.  For example:
+
+          "template": {
+            "url": "{{ url }}",
+            "title": "{{ title }}",
+            "description": "{{ body_text }}",
+            "last_modified": "{{ _response_.headers.Last-Modified | date: '%FT%T' }}"
+          }
+
+      In the `on_change` mode, change is detected based on the resulting event payload after applying this option.  If you want to add some keys to each event but ignore any change in them, set `mode` to `all` and put a DeDuplicationAgent downstream.
+
      # Liquid Templating

-      In Liquid templating, the following variable is available:
+      In Liquid templating, the following variables are available except when invoked by `data_from_event`:
+
+      * `_url_`: The URL specified to fetch the content from.

      * `_response_`: A response object with the following keys:

@ -121,14 +134,18 @@ module Agents

          * `headers`: Response headers; for example, `{{ _response_.headers.Content-Type }}` expands to the value of the Content-Type header.  Keys are insensitive to cases and -/_.

+          * `url`: The final URL of the fetched page, following redirects.  Using this in the `template` option, you can resolve relative URLs extracted from a document like `{{ link | to_uri: _response_.url }}` and `{{ content | rebase_hrefs: _response_.url }}`.
+
      # Ordering Events

      #{description_events_order}
    MD

    event_description do
+      keys = (options['template'].presence || options['extract']).keys # field names of whichever hash defines the payload
+
      "Events will have the following fields:\n\n    %s" % [
-        Utils.pretty_print(Hash[options['extract'].keys.map { |key|
+        Utils.pretty_print(Hash[keys.map { |key|
          [key, "..."]
        }])
      ]
@ -157,6 +174,7 @@ module Agents
      errors.add(:base, "either url, url_from_event, or data_from_event are required") unless options['url'].present? || options['url_from_event'].present? || options['data_from_event'].present?
      errors.add(:base, "expected_update_period_in_days is required") unless options['expected_update_period_in_days'].present?
      validate_extract_options!
+      validate_template_options!
      validate_http_success_codes!

      # Check for optional fields
@ -281,6 +299,15 @@ module Agents
      end
    end

+    def validate_template_options!
+      template = options['template'].presence or return
+
+      unless Hash === template &&
+             template.each_pair.all? { |key, value| String === value }
+        errors.add(:base, 'template must be a hash of strings.')
+      end
+    end
+
    def check
      check_urls(interpolated['url'])
    end
@ -305,6 +332,7 @@ module Agents
      raise "Failed: #{response.inspect}" unless consider_response_successful?(response)

      interpolation_context.stack {
+        interpolation_context['_url_'] = uri.to_s
        interpolation_context['_response_'] = ResponseDrop.new(response)
        handle_data(response.body, response.env[:url], existing_payload)
      }
@ -343,20 +371,33 @@ module Agents
            extract_xml(doc)
        end

-      num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
-
-      if num_unique_lengths.length != 1
+      if output.each_value.each_cons(2).any? { |m, n| m.size != n.size }
        raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
      end

-      old_events = previous_payloads num_unique_lengths.first
-      num_unique_lengths.first.times do |index|
-        result = {}
-        interpolated['extract'].keys.each do |name|
-          result[name] = output[name][index]
-          if name.to_s == 'url' && url.present?
-            result[name] = (url + Utils.normalize_uri(result[name])).to_s
+      num_tuples = output.each_value.first.size
+
+      old_events = previous_payloads num_tuples
+
+      template = options['template'].presence
+
+      num_tuples.times do |index|
+        extracted = {}
+        interpolated['extract'].each_key do |name|
+          extracted[name] = output[name][index]
+        end
+
+        result =
+          if template
+            interpolate_with(extracted) do
+              interpolate_options(template)
+            end
+          else
+            extracted
          end
+
+        if payload_url = result['url'].presence
+          result['url'] = (url + Utils.normalize_uri(payload_url)).to_s
        end

        if store_payload!(old_events, result)
@ -567,6 +608,11 @@ module Agents
      def status
        @object.status
      end
+
+      # The final URL of the response, after following any redirects
+      def url
+        @object.env.url.to_s
+      end
    end

    # Wraps Faraday::Utils::Headers
--- a/lib/utils.rb
+++ b/lib/utils.rb
@ -170,4 +170,93 @@ module Utils
      nil
    end
  end
+
+  module HTMLTransformer
+    SINGLE = 1
+    MULTIPLE = 2
+    COMMA_SEPARATED = 3
+    SRCSET = 4
+
+    URI_ATTRIBUTES = {
+      'a' => { 'href' => SINGLE },
+      'applet' => { 'archive' => COMMA_SEPARATED, 'codebase' => SINGLE },
+      'area' => { 'href' => SINGLE },
+      'audio' => { 'src' => SINGLE },
+      'base' => { 'href' => SINGLE },
+      'blockquote' => { 'cite' => SINGLE },
+      'body' => { 'background' => SINGLE },
+      'button' => { 'formaction' => SINGLE },
+      'command' => { 'icon' => SINGLE },
+      'del' => { 'cite' => SINGLE },
+      'embed' => { 'src' => SINGLE },
+      'form' => { 'action' => SINGLE },
+      'frame' => { 'longdesc' => SINGLE, 'src' => SINGLE },
+      'head' => { 'profile' => SINGLE },
+      'html' => { 'manifest' => SINGLE },
+      'iframe' => { 'longdesc' => SINGLE, 'src' => SINGLE },
+      'img' => { 'longdesc' => SINGLE, 'src' => SINGLE, 'srcset' => SRCSET, 'usemap' => SINGLE },
+      'input' => { 'formaction' => SINGLE, 'src' => SINGLE, 'usemap' => SINGLE },
+      'ins' => { 'cite' => SINGLE },
+      'link' => { 'href' => SINGLE },
+      'object' => { 'archive' => MULTIPLE, 'classid' => SINGLE, 'codebase' => SINGLE, 'data' => SINGLE, 'usemap' => SINGLE },
+      'q' => { 'cite' => SINGLE },
+      'script' => { 'src' => SINGLE },
+      'source' => { 'src' => SINGLE, 'srcset' => SRCSET },
+      'video' => { 'poster' => SINGLE, 'src' => SINGLE },
+    }
+
+    URI_ELEMENTS_XPATH = '//*[%s]' % URI_ATTRIBUTES.keys.map { |name| "name()='#{name}'" }.join(' or ')
+
+    module_function
+
+    def transform(html, &block)
+      block or raise ArgumentError, 'block must be given'
+
+      case html
+      when /\A\s*(?:<\?xml[\s?]|<!DOCTYPE\s)/i
+        doc = Nokogiri.parse(html)
+        yield doc
+        doc.to_s
+      when /\A\s*<(html|head|body)[\s>]/i
+        # Libxml2 automatically adds DOCTYPE and <html>, so we need to
+        # skip them.
+        element_name = $1
+        doc = Nokogiri::HTML::Document.parse(html)
+        yield doc
+        doc.at_xpath("//#{element_name}").xpath('self::node() | following-sibling::node()').to_s
+      else
+        doc = Nokogiri::HTML::Document.parse("<html><body>#{html}")
+        yield doc
+        doc.xpath("/html/body/node()").to_s
+      end
+    end
+
+    def replace_uris(html, &block)
+      block or raise ArgumentError, 'block must be given'
+
+      transform(html) { |doc|
+        doc.xpath(URI_ELEMENTS_XPATH).each { |element|
+          uri_attrs = URI_ATTRIBUTES[element.name] or next
+          uri_attrs.each { |name, format|
+            attr = element.attribute(name) or next
+            case format
+            when SINGLE
+              attr.value = block.call(attr.value.strip)
+            when MULTIPLE
+              attr.value = attr.value.gsub(/(\S+)/) { block.call($1) }
+            when COMMA_SEPARATED, SRCSET
+              attr.value = attr.value.gsub(/((?:\A|,)\s*)(\S+)/) { $1 + block.call($2) }
+            end
+          }
+        }
+      }
+    end
+  end
+
+  def self.rebase_hrefs(html, base_uri)
+    base_uri = normalize_uri(base_uri)
+    HTMLTransformer.replace_uris(html) { |url|
+      base_uri.merge(normalize_uri(url)).to_s
+    }
+  end
 end
--- a/spec/concerns/liquid_interpolatable_spec.rb
+++ b/spec/concerns/liquid_interpolatable_spec.rb
@ -323,4 +323,42 @@ describe LiquidInterpolatable::Filters do
      end
    end
  end
+
+  describe 'rebase_hrefs' do
+    let(:agent) { Agents::InterpolatableAgent.new(name: "test") }
+
+    let(:fragment) { <<HTML }
+<ul>
+  <li>
+    <a href="downloads/file1"><img src="/images/iconA.png" srcset="/images/iconA.png 1x, /images/iconA@2x.png 2x">file1</a>
+  </li>
+  <li>
+    <a href="downloads/file2"><img src="/images/iconA.png" srcset="/images/iconA.png 1x, /images/iconA@2x.png 2x">file2</a>
+  </li>
+  <li>
+    <a href="downloads/file3"><img src="/images/iconB.png" srcset="/images/iconB.png 1x, /images/iconB@2x.png 2x">file3</a>
+  </li>
+</ul>
+HTML
+
+    let(:replaced_fragment) { <<HTML }
+<ul>
+  <li>
+    <a href="http://example.com/support/downloads/file1"><img src="http://example.com/images/iconA.png" srcset="http://example.com/images/iconA.png 1x, http://example.com/images/iconA@2x.png 2x">file1</a>
+  </li>
+  <li>
+    <a href="http://example.com/support/downloads/file2"><img src="http://example.com/images/iconA.png" srcset="http://example.com/images/iconA.png 1x, http://example.com/images/iconA@2x.png 2x">file2</a>
+  </li>
+  <li>
+    <a href="http://example.com/support/downloads/file3"><img src="http://example.com/images/iconB.png" srcset="http://example.com/images/iconB.png 1x, http://example.com/images/iconB@2x.png 2x">file3</a>
+  </li>
+</ul>
+HTML
+
+    it 'rebases relative URLs in a fragment' do
+      agent.interpolation_context['content'] = fragment
+      agent.options['template'] = "{{ content | rebase_hrefs: 'http://example.com/support/files.html' }}"
+      expect(agent.interpolated['template']).to eq(replaced_fragment)
+    end
+  end
 end
--- a/spec/models/agents/website_agent_spec.rb
+++ b/spec/models/agents/website_agent_spec.rb
@ -8,6 +8,10 @@ describe Agents::WebsiteAgent do
                                           headers: {
                                             'X-Status-Message' => 'OK'
                                           })
+      stub_request(:any, /xkcd\.com\/index$/).to_return(status: 301,
+                                                        headers: {
+                                                          'Location' => 'http://xkcd.com/'
+                                                        })
      @valid_options = {
        'name' => "XKCD",
        'expected_update_period_in_days' => "2",
@ -729,14 +733,36 @@ describe Agents::WebsiteAgent do
      end

      it "should interpolate _response_" do
+        @valid_options['url'] = 'http://xkcd.com/index'
        @valid_options['extract']['response_info'] =
          @valid_options['extract']['url'].merge(
-            'value' => '"{{ "The reponse was " | append:_response_.status | append:" " | append:_response_.headers.X-Status-Message | append:"." }}"'
+            'value' => '{{ "The reponse from " | append:_response_.url | append:" was " | append:_response_.status | append:" " | append:_response_.headers.X-Status-Message | append:"." | to_xpath }}'
+          )
+        @valid_options['extract']['original_url'] =
+          @valid_options['extract']['url'].merge(
+            'value' => '{{ _url_ | to_xpath }}'
          )
        @checker.options = @valid_options
        @checker.check
        event = Event.last
-        expect(event.payload['response_info']).to eq('The reponse was 200 OK.')
+        expect(event.payload['response_info']).to eq('The reponse from http://xkcd.com/ was 200 OK.')
+        expect(event.payload['original_url']).to eq('http://xkcd.com/index')
+      end
+
+      it "should be formatted by template after extraction" do
+        @valid_options['template'] = {
+          'url' => '{{url}}',
+          'title' => '{{title | upcase}}',
+          'summary' => '{{title}}: {{hovertext | truncate: 20}}',
+        }
+        @checker.options = @valid_options
+        @checker.check
+        event = Event.last
+        expect(event.payload).to eq({
+                                      'title' => 'EVOLVING',
+                                      'url' => 'http://imgs.xkcd.com/comics/evolving.png',
+                                      'summary' => 'Evolving: Biologists play r...',
+                                    })
      end

      describe "XML" do
--- a/spec/support/shared_examples/agent_controller_concern.rb
+++ b/spec/support/shared_examples/agent_controller_concern.rb
@ -130,7 +130,10 @@ shared_examples_for AgentControllerConcern do
    end

    it "should configure targets with nested objects" do
-      agent.control_targets << agents(:bob_data_output_agent)
+      agent.control_targets = [
+        agents(:bob_basecamp_agent),  # does not support a `template` option, but anyway
+        agents(:bob_data_output_agent)
+      ]
      agent.options['action'] = 'configure'
      agent.options['configure_options'] = { 
        template: {