diff --git a/CHANGES.md b/CHANGES.md index 2c170157..8aca9838 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,6 @@ # Changes +* Jun 19, 2015 - Add `url_from_event` to WebsiteAgent. * Jun 17, 2015 - RssAgent emits events for new feed items in chronological order. * Jun 15, 2015 - Liquid filter `uri_expand` added. * Jun 12, 2015 - RSSAgent can now accept an array of URLs. diff --git a/app/models/agents/website_agent.rb b/app/models/agents/website_agent.rb index 9e9e7db6..a97b16d4 100644 --- a/app/models/agents/website_agent.rb +++ b/app/models/agents/website_agent.rb @@ -19,7 +19,7 @@ module Agents `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape) - The WebsiteAgent can also scrape based on incoming events. It will scrape the url contained in the `url` key of the incoming event payload. If you specify `merge` as the `mode`, it will retain the old payload and update it with the new values. + The WebsiteAgent can also scrape based on incoming events. It will scrape the url contained in the `url` key of the incoming event payload, or if you set `url_from_event` it is used as a Liquid template to generate the url to access. If you specify `merge` as the `mode`, it will retain the old payload and update it with the new values. # Supported Document Types @@ -135,7 +135,8 @@ module Agents def validate_options # Check for required fields - errors.add(:base, "url and expected_update_period_in_days are required") unless options['expected_update_period_in_days'].present? && options['url'].present? + errors.add(:base, "either url or url_from_event is required") unless options['url'].present? || options['url_from_event'].present? + errors.add(:base, "expected_update_period_in_days is required") unless options['expected_update_period_in_days'].present? if !options['extract'].present? && extraction_type != "json" errors.add(:base, "extract is required for all types except json") end @@ -257,7 +258,12 @@ module Agents def receive(incoming_events) incoming_events.each do |event| interpolate_with(event) do - url_to_scrape = event.payload['url'] + url_to_scrape = + if url_template = options['url_from_event'].presence + interpolate_string(url_template) + else + event.payload['url'] + end check_url(url_to_scrape, interpolated['mode'].to_s == "merge" ? event.payload : {}) end diff --git a/spec/models/agents/website_agent_spec.rb b/spec/models/agents/website_agent_spec.rb index 21ff2bf7..077fba8d 100644 --- a/spec/models/agents/website_agent_spec.rb +++ b/spec/models/agents/website_agent_spec.rb @@ -633,6 +633,17 @@ fire: hot }.to change { Event.count }.by(1) end + it "should use url_from_event as url to scrape if it exists when receiving an event" do + stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com') + + @checker.options = @valid_options.merge( + 'url_from_event' => 'http://example.org/?url={{url | uri_escape}}' + ) + @checker.receive([@event]) + + expect(stub).to have_been_requested + end + it "should interpolate values from incoming event payload" do expect { @valid_options['extract'] = {