From 5eeca709c7475d635e749e610fdc58499d0a17c1 Mon Sep 17 00:00:00 2001
From: Andrew Cantino <cantino@users.noreply.github.com>
Date: Tue, 24 Nov 2015 12:34:20 -0800
Subject: [PATCH] Add event_data_path option to the WebsiteAgent to handle data
 directly in incoming Events

---
 app/models/agents/website_agent.rb       | 110 ++++++----
 spec/models/agents/website_agent_spec.rb | 250 ++++++++++++++++-------
 2 files changed, 240 insertions(+), 120 deletions(-)

diff --git a/app/models/agents/website_agent.rb b/app/models/agents/website_agent.rb
index 468b40b3..d0ef15b2 100644
--- a/app/models/agents/website_agent.rb
+++ b/app/models/agents/website_agent.rb
@@ -20,7 +20,12 @@ module Agents
 
       `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape)
 
-      The WebsiteAgent can also scrape based on incoming events. It will scrape the url contained in the `url` key of the incoming event payload, or if you set `url_from_event` it is used as a Liquid template to generate the url to access. If you specify `merge` as the `mode`, it will retain the old payload and update it with the new values.
+      The WebsiteAgent can also scrape based on incoming events.
+
+      * If the Event contains a `url` key, that URL will be fetched.
+      * For more control, you can set the `url_from_event` option and it will be used as a Liquid template to generate the url to access based on the Event.
+      * If you set `event_data_path` to the [JSONPath](http://goessner.net/articles/JsonPath/) of content in the Event, that will be used directly without fetching any URL.
+      * If you specify `merge` for the `mode` option, Huginn will retain the old payload and update it with the new values.
 
       # Supported Document Types
 
@@ -140,7 +145,7 @@ module Agents
 
     def validate_options
       # Check for required fields
-      errors.add(:base, "either url or url_from_event is required") unless options['url'].present? || options['url_from_event'].present?
+      errors.add(:base, "either url, url_from_event, or event_data_path are required") unless options['url'].present? || options['url_from_event'].present? || options['event_data_path'].present?
       errors.add(:base, "expected_update_period_in_days is required") unless options['expected_update_period_in_days'].present?
       validate_extract_options!
 
@@ -251,15 +256,15 @@ module Agents
       check_urls(interpolated['url'])
     end
 
-    def check_urls(in_url, payload = {})
+    def check_urls(in_url, existing_payload = {})
       return unless in_url.present?
 
       Array(in_url).each do |url|
-        check_url(url, payload)
+        check_url(url, existing_payload)
       end
     end
 
-    def check_url(url, payload = {})
+    def check_url(url, existing_payload = {})
       unless /\Ahttps?:\/\//i === url
         error "Ignoring a non-HTTP url: #{url.inspect}"
         return
@@ -271,70 +276,91 @@ module Agents
 
       interpolation_context.stack {
         interpolation_context['_response_'] = ResponseDrop.new(response)
-        body = response.body
-        doc = parse(body)
+        handle_data(response.body, response.env[:url], existing_payload)
+      }
+    rescue => e
+      error "Error when fetching url: #{e.message}\n#{e.backtrace.join("\n")}"
+    end
 
-        if extract_full_json?
-          if store_payload!(previous_payloads(1), doc)
-            log "Storing new result for '#{name}': #{doc.inspect}"
-            create_event payload: payload.merge(doc)
-          end
-          return
+    def handle_data(body, url, existing_payload)
+      doc = parse(body)
+
+      if extract_full_json?
+        if store_payload!(previous_payloads(1), doc)
+          log "Storing new result for '#{name}': #{doc.inspect}"
+          create_event payload: existing_payload.merge(doc)
         end
+        return
+      end
 
-        output =
-          case extraction_type
+      output =
+        case extraction_type
           when 'json'
             extract_json(doc)
           when 'text'
             extract_text(doc)
           else
             extract_xml(doc)
-          end
-
-        num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
-
-        if num_unique_lengths.length != 1
-          raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
         end
 
-        old_events = previous_payloads num_unique_lengths.first
-        num_unique_lengths.first.times do |index|
-          result = {}
-          interpolated['extract'].keys.each do |name|
-            result[name] = output[name][index]
-            if name.to_s == 'url'
-              result[name] = (response.env[:url] + Utils.normalize_uri(result[name])).to_s
-            end
-          end
+      num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
 
-          if store_payload!(old_events, result)
-            log "Storing new parsed result for '#{name}': #{result.inspect}"
-            create_event payload: payload.merge(result)
+      if num_unique_lengths.length != 1
+        raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
+      end
+
+      old_events = previous_payloads num_unique_lengths.first
+      num_unique_lengths.first.times do |index|
+        result = {}
+        interpolated['extract'].keys.each do |name|
+          result[name] = output[name][index]
+          if name.to_s == 'url' && url.present?
+            result[name] = (url + Utils.normalize_uri(result[name])).to_s
           end
         end
-      }
-    rescue => e
-      error "Error when fetching url: #{e.message}\n#{e.backtrace.join("\n")}"
+
+        if store_payload!(old_events, result)
+          log "Storing new parsed result for '#{name}': #{result.inspect}"
+          create_event payload: existing_payload.merge(result)
+        end
+      end
     end
 
     def receive(incoming_events)
       incoming_events.each do |event|
         interpolate_with(event) do
-          url_to_scrape =
-            if url_template = options['url_from_event'].presence
-              interpolate_options(url_template)
+          existing_payload = interpolated['mode'].to_s == "merge" ? event.payload : {}
+
+          if event_data_path = options['event_data_path'].presence
+            data = Utils.value_at(event.payload, interpolate_options(event_data_path))
+            if data.present?
+              handle_event_data(data, event, existing_payload)
             else
-              event.payload['url']
+              error "No data was found in the Event payload at the JSONPath #{interpolate_options(event_data_path)}", inbound_event: event
             end
-          check_urls(url_to_scrape,
-                    interpolated['mode'].to_s == "merge" ? event.payload : {})
+          else
+            url_to_scrape =
+              if event_data_path = options['event_data_path'].presence
+                interpolate_options(event_data_path)
+              elsif url_template = options['url_from_event'].presence
+                interpolate_options(url_template)
+              else
+                event.payload['url']
+              end
+            check_urls(url_to_scrape, existing_payload)
+          end
         end
       end
     end
 
     private
 
+    def handle_event_data(data, event, existing_payload)
+      handle_data(data, event.payload['url'], existing_payload)
+    rescue => e
+      error "Error when handling event data: #{e.message}\n#{e.backtrace.join("\n")}", inbound_event: event
+    end
+
     # This method returns true if the result should be stored as a new event.
     # If mode is set to 'on_change', this method may return false and update an existing
     # event to expire further in the future.
diff --git a/spec/models/agents/website_agent_spec.rb b/spec/models/agents/website_agent_spec.rb
index 1f60d45f..7dcc766d 100644
--- a/spec/models/agents/website_agent_spec.rb
+++ b/spec/models/agents/website_agent_spec.rb
@@ -763,92 +763,186 @@ fire: hot
     end
 
     describe "#receive" do
-      before do
-        @event = Event.new
-        @event.agent = agents(:bob_rain_notifier_agent)
-        @event.payload = {
-          'url' => 'http://xkcd.com',
-          'link' => 'Random',
-        }
-      end
-
-      it "should scrape from the url element in incoming event payload" do
-        expect {
-          @checker.options = @valid_options
-          @checker.receive([@event])
-        }.to change { Event.count }.by(1)
-      end
-
-      it "should use url_from_event as url to scrape if it exists when receiving an event" do
-        stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
-
-        @checker.options = @valid_options.merge(
-          'url_from_event' => 'http://example.org/?url={{url | uri_escape}}'
-        )
-        @checker.receive([@event])
-
-        expect(stub).to have_been_requested
-      end
-
-      it "should allow url_from_event to be an array of urls" do
-        stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
-        stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Fxkcd.com')
-
-        @checker.options = @valid_options.merge(
-          'url_from_event' => ['http://example.org/?url={{url | uri_escape}}', 'http://google.org/?url={{url | uri_escape}}']
-        )
-        @checker.receive([@event])
-
-        expect(stub1).to have_been_requested
-        expect(stub2).to have_been_requested
-      end
-
-      it "should interpolate values from incoming event payload" do
-        expect {
-          @valid_options['extract'] = {
-            'from' => {
-              'xpath' => '*[1]',
-              'value' => '{{url | to_xpath}}'
-            },
-            'to' => {
-              'xpath' => '(//a[@href and text()={{link | to_xpath}}])[1]',
-              'value' => '@href'
-            },
+      describe "with a url or url_from_event" do
+        before do
+          @event = Event.new
+          @event.agent = agents(:bob_rain_notifier_agent)
+          @event.payload = {
+            'url' => 'http://xkcd.com',
+            'link' => 'Random',
           }
-          @checker.options = @valid_options
-          @checker.receive([@event])
-        }.to change { Event.count }.by(1)
+        end
 
-        expect(Event.last.payload).to eq({
-          'from' => 'http://xkcd.com',
-          'to' => 'http://dynamic.xkcd.com/random/comic/',
-        })
+        it "should scrape from the url element in incoming event payload" do
+          expect {
+            @checker.options = @valid_options
+            @checker.receive([@event])
+          }.to change { Event.count }.by(1)
+        end
+
+        it "should use url_from_event as url to scrape if it exists when receiving an event" do
+          stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
+
+          @checker.options = @valid_options.merge(
+            'url_from_event' => 'http://example.org/?url={{url | uri_escape}}'
+          )
+          @checker.receive([@event])
+
+          expect(stub).to have_been_requested
+        end
+
+        it "should allow url_from_event to be an array of urls" do
+          stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
+          stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Fxkcd.com')
+
+          @checker.options = @valid_options.merge(
+            'url_from_event' => ['http://example.org/?url={{url | uri_escape}}', 'http://google.org/?url={{url | uri_escape}}']
+          )
+          @checker.receive([@event])
+
+          expect(stub1).to have_been_requested
+          expect(stub2).to have_been_requested
+        end
+
+        it "should interpolate values from incoming event payload" do
+          expect {
+            @valid_options['extract'] = {
+              'from' => {
+                'xpath' => '*[1]',
+                'value' => '{{url | to_xpath}}'
+              },
+              'to' => {
+                'xpath' => '(//a[@href and text()={{link | to_xpath}}])[1]',
+                'value' => '@href'
+              },
+            }
+            @checker.options = @valid_options
+            @checker.receive([@event])
+          }.to change { Event.count }.by(1)
+
+          expect(Event.last.payload).to eq({
+            'from' => 'http://xkcd.com',
+            'to' => 'http://dynamic.xkcd.com/random/comic/',
+          })
+        end
+
+        it "should interpolate values from incoming event payload and _response_" do
+          @event.payload['title'] = 'XKCD'
+
+          expect {
+            @valid_options['extract'] = {
+              'response_info' => @valid_options['extract']['url'].merge(
+                'value' => '{% capture sentence %}The reponse from {{title}} was {{_response_.status}} {{_response_.headers.X-Status-Message}}.{% endcapture %}{{sentence | to_xpath}}'
+              )
+            }
+            @checker.options = @valid_options
+            @checker.receive([@event])
+          }.to change { Event.count }.by(1)
+
+          expect(Event.last.payload['response_info']).to eq('The reponse from XKCD was 200 OK.')
+        end
+
+        it "should support merging of events" do
+          expect {
+            @checker.options = @valid_options
+            @checker.options[:mode] = "merge"
+            @checker.receive([@event])
+          }.to change { Event.count }.by(1)
+          last_payload = Event.last.payload
+          expect(last_payload['link']).to eq('Random')
+        end
       end
 
-      it "should interpolate values from incoming event payload and _response_" do
-        @event.payload['title'] = 'XKCD'
+      describe "with a event_data_path" do
+        describe "with json data" do
+          before do
+            @event = Event.new
+            @event.agent = agents(:bob_rain_notifier_agent)
+            @event.payload = {
+              'something' => 'some value',
+              'some_object' => {
+                'some_data' => { hello: 'world' }.to_json
+              }
+            }
+            @event.save!
 
-        expect {
-          @valid_options['extract'] = {
-            'response_info' => @valid_options['extract']['url'].merge(
-              'value' => '{% capture sentence %}The reponse from {{title}} was {{_response_.status}} {{_response_.headers.X-Status-Message}}.{% endcapture %}{{sentence | to_xpath}}'
+            @checker.options = @valid_options.merge(
+              'type' => 'json',
+              'event_data_path' => 'some_object.some_data',
+              'extract' => {
+                'value' => { 'path' => 'hello' }
+              }
             )
-          }
-          @checker.options = @valid_options
-          @checker.receive([@event])
-        }.to change { Event.count }.by(1)
+          end
 
-        expect(Event.last.payload['response_info']).to eq('The reponse from XKCD was 200 OK.')
-      end
+          it "should extract from the event data in the incoming event payload" do
+            expect {
+              @checker.receive([@event])
+            }.to change { Event.count }.by(1)
+            expect(@checker.events.last.payload).to eq({ 'value' => 'world' })
+          end
 
-      it "should support merging of events" do
-        expect {
-          @checker.options = @valid_options
-          @checker.options[:mode] = "merge"
-          @checker.receive([@event])
-        }.to change { Event.count }.by(1)
-        last_payload = Event.last.payload
-        expect(last_payload['link']).to eq('Random')
+          it "should support merge mode" do
+            @checker.options['mode'] = "merge"
+
+            expect {
+              @checker.receive([@event])
+            }.to change { Event.count }.by(1)
+            expect(@checker.events.last.payload).to eq(@event.payload.merge('value' => 'world'))
+          end
+
+          it "should output an error when nothing can be found at the path" do
+            @checker.options = @checker.options.merge(
+              'event_data_path' => 'some_object.mistake'
+            )
+
+            expect {
+              @checker.receive([@event])
+            }.to_not change { Event.count }
+
+            expect(@checker.logs.last.message).to match(/No data was found in the Event payload at the JSONPath some_object.mistake/)
+          end
+
+          it "should output an error when the data cannot be parsed" do
+            @event.update_attribute :payload, @event.payload.merge('some_object' => { 'some_data' => '{invalid json' })
+
+            expect {
+              @checker.receive([@event])
+            }.to_not change { Event.count }
+
+            expect(@checker.logs.last.message).to match(/Error when handling event data:/)
+          end
+        end
+
+        describe "with HTML data" do
+          before do
+            @event = Event.new
+            @event.agent = agents(:bob_rain_notifier_agent)
+            @event.payload = {
+              'url' => 'http://xkcd.com',
+              'some_object' => {
+                'some_data' => "<div><span class='title'>Title!</span><span class='body'>Body!</span></div>"
+              }
+            }
+            @event.save!
+
+            @checker.options = @valid_options.merge(
+              'type' => 'html',
+              'event_data_path' => 'some_object.some_data',
+              'extract' => {
+                'title' => { 'css' => ".title", 'value' => ".//text()" },
+                'body' => { 'css' => "div span.body", 'value' => ".//text()" }
+              }
+            )
+          end
+
+          it "should extract from the event data in the incoming event payload" do
+            expect {
+              @checker.receive([@event])
+            }.to change { Event.count }.by(1)
+            expect(@checker.events.last.payload).to eq({ 'title' => 'Title!', 'body' => 'Body!' })
+          end
+        end
       end
     end
   end