From b876759b7b426dd7a1598c4c0ff04b41be78c4be Mon Sep 17 00:00:00 2001
From: Andrew Cantino <cantino@gmail.com>
Date: Sun, 17 Mar 2013 15:56:02 -0700
Subject: [PATCH] Add JSONPath for hash paths and add JSON parsing to the
 WebsiteAgent.

---
 Gemfile                                       |  1 +
 Gemfile.lock                                  |  3 +
 app/models/agent.rb                           | 22 +-----
 app/models/agents/peak_detector_agent.rb      |  6 +-
 app/models/agents/trigger_agent.rb            | 11 +--
 app/models/agents/website_agent.rb            | 75 ++++++++++++++----
 config/initializers/multi_xml_patch.rb        |  2 +-
 lib/utils.rb                                  | 10 +++
 spec/lib/utils_spec.rb                        | 22 ++++++
 .../models/agents/peak_detector_agent_spec.rb |  8 +-
 spec/models/agents/trigger_agent_spec.rb      |  7 --
 spec/models/agents/website_agent_spec.rb      | 79 +++++++++++++++++++
 12 files changed, 185 insertions(+), 61 deletions(-)
 create mode 100644 spec/lib/utils_spec.rb

diff --git a/Gemfile b/Gemfile
index ec4a0079..2c0f40be 100644
--- a/Gemfile
+++ b/Gemfile
@@ -8,6 +8,7 @@ gem 'kaminari'
 gem 'bootstrap-kaminari-views'
 gem "rufus-scheduler", :require => false
 gem 'json', '>= 1.7.7'
+gem 'jsonpath'
 
 gem 'delayed_job', :git => 'https://github.com/wok/delayed_job' # Until the YAML issues are fixed in master.
 gem 'delayed_job_active_record', "~> 0.3.3" # newer was giving a strange MySQL error
diff --git a/Gemfile.lock b/Gemfile.lock
index db205f81..fd77c928 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -115,6 +115,8 @@ GEM
       jquery-rails
       railties (>= 3.1.0)
     json (1.7.7)
+    jsonpath (0.5.1)
+      multi_json
     kaminari (0.14.1)
       actionpack (>= 3.0.0)
       activesupport (>= 3.0.0)
@@ -275,6 +277,7 @@ DEPENDENCIES
   geokit-rails3
   jquery-rails
   json (>= 1.7.7)
+  jsonpath
   kaminari
   kramdown
   mysql2
diff --git a/app/models/agent.rb b/app/models/agent.rb
index 3bc4651a..abbd46c6 100644
--- a/app/models/agent.rb
+++ b/app/models/agent.rb
@@ -85,27 +85,7 @@ class Agent < ActiveRecord::Base
   end
 
   def make_message(payload, message = options[:message])
-    message.gsub(/<([^>]+)>/) { value_at(payload, $1) || "??" }
-  end
-
-  def value_at(data, path)
-    if data.is_a?(Hash)
-      path.split(".").inject(data) { |memo, segment|
-        if memo
-          if memo[segment]
-            memo[segment]
-          elsif memo[segment.to_sym]
-            memo[segment.to_sym]
-          else
-            nil
-          end
-        else
-          nil
-        end
-      }.to_s
-    else
-      data
-    end
+    message.gsub(/<([^>]+)>/) { Utils.value_at(payload, $1) || "??" }
   end
 
   def set_default_schedule
diff --git a/app/models/agents/peak_detector_agent.rb b/app/models/agents/peak_detector_agent.rb
index 857cc3dd..be3ab37b 100644
--- a/app/models/agents/peak_detector_agent.rb
+++ b/app/models/agents/peak_detector_agent.rb
@@ -7,7 +7,7 @@ module Agents
     description <<-MD
       Use a PeakDetectorAgent to watch for peaks in an event stream.  When a peak is detected, the resulting Event will have a payload message of `message`.  You can include extractions in the message, for example: `I saw a bar of: <foo.bar>`
 
-      The `value_path` value is a hash path to the value of interest.  `group_by_path` is a hash path that will be used to group values, if present.
+      The `value_path` value is a [JSONPath](http://goessner.net/articles/JsonPath/) to the value of interest.  `group_by_path` is a JSONPath that will be used to group values, if present.
 
       Set `expected_receive_period_in_days` to the maximum amount of time that you'd expect to pass between Events being received by this Agent.
 
@@ -106,13 +106,13 @@ module Agents
     end
 
     def group_for(event)
-      ((options[:group_by_path].present? && value_at(event.payload, options[:group_by_path])) || 'no_group').to_sym
+      ((options[:group_by_path].present? && Utils.value_at(event.payload, options[:group_by_path])) || 'no_group').to_sym
     end
 
     def remember(group, event)
       memory[:data] ||= {}
       memory[:data][group] ||= []
-      memory[:data][group] << [value_at(event.payload, options[:value_path]), event.created_at.to_i]
+      memory[:data][group] << [Utils.value_at(event.payload, options[:value_path]), event.created_at.to_i]
       cleanup group
     end
 
diff --git a/app/models/agents/trigger_agent.rb b/app/models/agents/trigger_agent.rb
index 0f2c49ee..9387f38c 100644
--- a/app/models/agents/trigger_agent.rb
+++ b/app/models/agents/trigger_agent.rb
@@ -7,14 +7,7 @@ module Agents
     description <<-MD
       Use a TriggerAgent to watch for a specific value in an Event payload.
 
-      The `rules` array contains hashes of `path`, `value`, and `type`.  The `path` value is a dotted path through a hash, for example `foo.bar` would return `hello` from this structure:
-
-          {
-            :foo => {
-              :bar => "hello"
-            },
-            :something => "else"
-          }
+      The `rules` array contains hashes of `path`, `value`, and `type`.  The `path` value is a dotted path through a hash in [JSONPath](http://goessner.net/articles/JsonPath/) syntax.
 
       The `type` can be one of #{VALID_COMPARISON_TYPES.map { |t| "`#{t}`" }.to_sentence} and compares with the `value`.
 
@@ -55,7 +48,7 @@ module Agents
     def receive(incoming_events)
       incoming_events.each do |event|
         match = options[:rules].all? do |rule|
-          value_at_path = value_at(event[:payload], rule[:path])
+          value_at_path = Utils.value_at(event[:payload], rule[:path])
           case rule[:type]
             when "regex"
               value_at_path.to_s =~ Regexp.new(rule[:value], Regexp::IGNORECASE)
diff --git a/app/models/agents/website_agent.rb b/app/models/agents/website_agent.rb
index 1e8d8d0d..f8eb472f 100644
--- a/app/models/agents/website_agent.rb
+++ b/app/models/agents/website_agent.rb
@@ -7,12 +7,15 @@ module Agents
     cannot_receive_events!
 
     description <<-MD
-      The WebsiteAgent scrapes a website and creates Events based on any changes in the results.
+      The WebsiteAgent scrapes a website, XML document, or JSON feed and creates Events based on the results.
 
-      Specify the website's `url` and select a `mode` for when to create Events based on the scraped data, either `all` or `on_change`.
+      Specify a `url` and select a `mode` for when to create Events based on the scraped data, either `all` or `on_change`.
 
-      To tell the Agent how to scrape the site, specify `extract` as a hash with keys naming the extractions and values of hashes.
-      These subhashes specify how to extract with a `:css` CSS selector and either `:text => true` or `attr` pointing to an attribute name to grab.  An example:
+      The `type` value can be `xml`, `html`, or `json`.
+
+      To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
+
+      When parsing HTML or XML, these sub-hashes specify how to extract with a `:css` CSS selector and either `:text => true` or `:attr` pointing to an attribute name to grab.  An example:
 
           :extract => {
             :url => { :css => "#comic img", :attr => "src" },
@@ -20,12 +23,20 @@ module Agents
             :body_text => { :css => "div.main", :text => true }
           }
 
-      Note that whatever you extract MUST have the same number of matches for each extractor.  E.g., if you're extracting rows, all extractors must match all rows.
+      When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about.  For example:
+
+          :extract => {
+            :title => { :path => "results.data[*].title" },
+            :description => { :path => "results.data[*].description" }
+          }
+
+      Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor.  E.g., if you're extracting rows, all extractors must match all rows.
 
       Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent.
     MD
 
-    event_description do <<-MD
+    event_description do
+      <<-MD
       Events will have the fields you specified.  Your options look like:
 
           #{PP.pp(options[:extract], "")}
@@ -44,6 +55,7 @@ module Agents
       {
           :expected_update_period_in_days => "2",
           :url => "http://xkcd.com",
+          :type => "html",
           :mode => :on_change,
           :extract => {
               :url => {:css => "#comic img", :attr => "src"},
@@ -60,18 +72,22 @@ module Agents
       hydra = Typhoeus::Hydra.new
       request = Typhoeus::Request.new(options[:url], :followlocation => true)
       request.on_complete do |response|
-        doc = (options[:type].to_s == "xml" || options[:url] =~ /\.(rss|xml)$/i) ? Nokogiri::XML(response.body) : Nokogiri::HTML(response.body)
+        doc = parse(response.body)
         output = {}
         options[:extract].each do |name, extraction_details|
-          output[name] = doc.css(extraction_details[:css]).map { |node|
-            if extraction_details[:attr]
-              node.attr(extraction_details[:attr])
-            elsif extraction_details[:text]
-              node.text()
-            else
-              raise StandardError, ":attr or :text is required on each of the extraction patterns."
-            end
-          }
+          if extraction_type == "json"
+            output[name] = Utils.values_at(doc, extraction_details[:path])
+          else
+            output[name] = doc.css(extraction_details[:css]).map { |node|
+              if extraction_details[:attr]
+                node.attr(extraction_details[:attr])
+              elsif extraction_details[:text]
+                node.text()
+              else
+                raise StandardError, ":attr or :text is required on HTML or XML extraction patterns"
+              end
+            }
+          end
         end
 
         num_unique_lengths = options[:extract].keys.map { |name| output[name].length }.uniq
@@ -94,5 +110,32 @@ module Agents
       hydra.queue request
       hydra.run
     end
+
+    private
+
+    def extraction_type
+      (options[:type] || begin
+        if options[:url] =~ /\.(rss|xml)$/i
+          "xml"
+        elsif options[:url] =~ /\.json$/i
+          "json"
+        else
+          "html"
+        end
+      end).to_s
+    end
+
+    def parse(data)
+      case extraction_type
+        when "xml"
+          Nokogiri::XML(data)
+        when "json"
+          JSON.parse(data)
+        when "html"
+          Nokogiri::HTML(data)
+        else
+          raise "Unknown extraction type #{extraction_type}"
+      end
+    end
   end
 end
\ No newline at end of file
diff --git a/config/initializers/multi_xml_patch.rb b/config/initializers/multi_xml_patch.rb
index 5f5f54be..2f178313 100644
--- a/config/initializers/multi_xml_patch.rb
+++ b/config/initializers/multi_xml_patch.rb
@@ -15,7 +15,7 @@ module MultiXml
     end
   end
 
-  DISALLOWED_XML_TYPES = %w(symbol yaml)
+  DISALLOWED_XML_TYPES = %w(symbol yaml) unless defined?(DISALLOWED_XML_TYPES)
 
   class << self
     def parse(xml, options={})
diff --git a/lib/utils.rb b/lib/utils.rb
index 6ca0f0ea..07ac4d6b 100644
--- a/lib/utils.rb
+++ b/lib/utils.rb
@@ -1,3 +1,5 @@
+require 'jsonpath'
+
 module Utils
   # Unindents if the indentation is 2 or more characters.
   def self.unindent(s)
@@ -14,4 +16,12 @@ module Utils
         object
     end
   end
+
+  def self.value_at(data, path)
+    values_at(data, path).first
+  end
+
+  def self.values_at(data, path)
+    JsonPath.new(path).on(data.is_a?(String) ? data : data.to_json)
+  end
 end
\ No newline at end of file
diff --git a/spec/lib/utils_spec.rb b/spec/lib/utils_spec.rb
new file mode 100644
index 00000000..847be971
--- /dev/null
+++ b/spec/lib/utils_spec.rb
@@ -0,0 +1,22 @@
+require 'spec_helper'
+
+describe Utils do
+  describe "#value_at" do
+    it "returns the value at a JSON path" do
+      Utils.value_at({ :foo => { :bar => :baz }}.to_json, "foo.bar").should == "baz"
+      Utils.value_at({ :foo => { :bar => { :bing => 2 } }}, "foo.bar.bing").should == 2
+    end
+
+    it "returns nil when the path cannot be followed" do
+      Utils.value_at({ :foo => { :bar => :baz }}, "foo.bing").should be_nil
+    end
+  end
+
+  describe "#values_at" do
+    it "returns arrays of matching values" do
+      Utils.values_at({ :foo => { :bar => :baz }}, "foo.bar").should == %w[baz]
+      Utils.values_at({ :foo => [ { :bar => :baz }, { :bar => :bing } ]}, "foo[*].bar").should == %w[baz bing]
+      Utils.values_at({ :foo => [ { :bar => :baz }, { :bar => :bing } ]}, "foo[*].bar").should == %w[baz bing]
+    end
+  end
+end
\ No newline at end of file
diff --git a/spec/models/agents/peak_detector_agent_spec.rb b/spec/models/agents/peak_detector_agent_spec.rb
index d7ee17de..3bf69f84 100644
--- a/spec/models/agents/peak_detector_agent_spec.rb
+++ b/spec/models/agents/peak_detector_agent_spec.rb
@@ -22,9 +22,9 @@ describe Agents::PeakDetectorAgent do
       events = build_events(:keys => [:count, :filter],
                             :values => [[1, "something"], [2, "something"], [3, "else"]])
       @agent.receive events
-      @agent.memory[:data][:something].map(&:first).should == %w[1 2]
+      @agent.memory[:data][:something].map(&:first).should == [1, 2]
       @agent.memory[:data][:something].last.last.should be_within(10).of((100 - 1).hours.ago.to_i)
-      @agent.memory[:data][:else].first.first.should == "3"
+      @agent.memory[:data][:else].first.first.should == 3
       @agent.memory[:data][:else].first.last.should be_within(10).of((100 - 2).hours.ago.to_i)
     end
 
@@ -32,7 +32,7 @@ describe Agents::PeakDetectorAgent do
       @agent.options[:group_by_path] = ""
       events = build_events(:keys => [:count], :values => [[1], [2]])
       @agent.receive events
-      @agent.memory[:data][:no_group].map(&:first).should == %w[1 2]
+      @agent.memory[:data][:no_group].map(&:first).should == [1, 2]
     end
 
     it "keeps a rolling window of data" do
@@ -40,7 +40,7 @@ describe Agents::PeakDetectorAgent do
       @agent.receive build_events(:keys => [:count],
                                   :values => [1, 2, 3, 4, 5, 6, 7, 8].map {|i| [i]},
                                   :pattern => { :filter => "something" })
-      @agent.memory[:data][:something].map(&:first).should == %w[4 5 6 7 8]
+      @agent.memory[:data][:something].map(&:first).should == [4, 5, 6, 7, 8]
     end
 
     it "finds peaks" do
diff --git a/spec/models/agents/trigger_agent_spec.rb b/spec/models/agents/trigger_agent_spec.rb
index 83b8a570..aeb61859 100644
--- a/spec/models/agents/trigger_agent_spec.rb
+++ b/spec/models/agents/trigger_agent_spec.rb
@@ -120,13 +120,6 @@ describe Agents::TriggerAgent do
         @checker.receive([@event])
       }.should_not change { Event.count }
 
-
-      @event.payload = "world"
-      @checker.options[:rules].first[:path] = "anything"
-      lambda {
-        @checker.receive([@event])
-      }.should change { Event.count }.by(1)
-
       @checker.options[:rules].first[:value] = "hi"
       lambda {
         @checker.receive([@event])
diff --git a/spec/models/agents/website_agent_spec.rb b/spec/models/agents/website_agent_spec.rb
index 54306d9d..eadefcc5 100644
--- a/spec/models/agents/website_agent_spec.rb
+++ b/spec/models/agents/website_agent_spec.rb
@@ -6,6 +6,7 @@ describe Agents::WebsiteAgent do
     @site = {
         :name => "XKCD",
         :expected_update_period_in_days => 2,
+        :type => "html",
         :url => "http://xkcd.com",
         :mode => :on_change,
         :extract => {
@@ -41,4 +42,82 @@ describe Agents::WebsiteAgent do
       }.should raise_error(StandardError, /Got an uneven number of matches/)
     end
   end
+
+  describe "parsing" do
+    it "parses CSS" do
+      @checker.check
+      event = Event.last
+      event.payload[:url].should == "http://imgs.xkcd.com/comics/evolving.png"
+      event.payload[:title].should =~ /^Biologists play reverse/
+    end
+
+    describe "JSON" do
+      it "works with paths" do
+        json = {
+            :response => {
+                :version => 2,
+                :title => "hello!"
+            }
+        }
+        stub_request(:any, /json-site/).to_return(:body => json.to_json, :status => 200)
+        site = {
+            :name => "Some JSON Response",
+            :expected_update_period_in_days => 2,
+            :type => "json",
+            :url => "http://json-site.com",
+            :mode => :on_change,
+            :extract => {
+                :version => { :path => "response.version" },
+                :title => { :path => "response.title" }
+            }
+        }
+        checker = Agents::WebsiteAgent.new(:name => "Weather Site", :options => site)
+        checker.user = users(:bob)
+        checker.save!
+
+        checker.check
+        event = Event.last
+        event.payload[:version].should == 2
+        event.payload[:title].should == "hello!"
+      end
+
+      it "can handle arrays" do
+        json = {
+            :response => {
+                :data => [
+                    { :title => "first", :version => 2 },
+                    { :title => "second", :version => 2.5 }
+                ]
+            }
+        }
+        stub_request(:any, /json-site/).to_return(:body => json.to_json, :status => 200)
+        site = {
+            :name => "Some JSON Response",
+            :expected_update_period_in_days => 2,
+            :type => "json",
+            :url => "http://json-site.com",
+            :mode => :on_change,
+            :extract => {
+                :title => { :path => "response.data[*].title" },
+                :version => { :path => "response.data[*].version" }
+            }
+        }
+        checker = Agents::WebsiteAgent.new(:name => "Weather Site", :options => site)
+        checker.user = users(:bob)
+        checker.save!
+
+        lambda {
+          checker.check
+        }.should change { Event.count }.by(2)
+
+        event = Event.all[-1]
+        event.payload[:version].should == 2.5
+        event.payload[:title].should == "second"
+
+        event = Event.all[-2]
+        event.payload[:version].should == 2
+        event.payload[:title].should == "first"
+      end
+    end
+  end
 end
\ No newline at end of file