From fe35df87522247bc646753dd59520aa58c9b3422 Mon Sep 17 00:00:00 2001 From: Akinori MUSHA Date: Mon, 17 Oct 2016 21:33:50 +0900 Subject: [PATCH] Add a new option `template` to WebsiteAgent If given, it is used as a Liquid template for each event created by the Agent, instead of directly emitting the results of extraction as events. An existing spec needs to be fixed because WebsiteAgent now has the `template` option, which may not be a hash of hashes. --- app/models/agents/website_agent.rb | 58 +++++++++++++++---- spec/models/agents/website_agent_spec.rb | 16 +++++ .../agent_controller_concern.rb | 5 +- 3 files changed, 67 insertions(+), 12 deletions(-) diff --git a/app/models/agents/website_agent.rb b/app/models/agents/website_agent.rb index bbb1a482..db787d77 100644 --- a/app/models/agents/website_agent.rb +++ b/app/models/agents/website_agent.rb @@ -111,6 +111,17 @@ module Agents Set `http_success_codes` to an array of status codes (e.g., `[404, 422]`) to treat HTTP response codes beyond 200 as successes. + If a `template` option is given, it is used as a Liquid template for each event created by this Agent, instead of directly emitting the results of extraction as events. In the template, keys of extracted data can be interpolated, and some additional variables are also available as explained in the next section. For example: + + "template": { + "url": "{{ url }}", + "title": "{{ title }}", + "description": "{{ body_text }}", + "last_modified": "{{ _response_.headers.Last-Modified | date: '%FT%T' }}" + } + + In the `on_change` mode, change is detected based on the resulting event payload after applying this option. If you want to add some keys to each event but ignore any change in them, set `mode` to `all` and put a DeDuplicationAgent downstream. 
+ # Liquid Templating In Liquid templating, the following variable is available: @@ -127,8 +138,10 @@ module Agents MD event_description do + keys = options['template'].presence || options['extract'].keys + "Events will have the following fields:\n\n %s" % [ - Utils.pretty_print(Hash[options['extract'].keys.map { |key| + Utils.pretty_print(Hash[keys.map { |key| [key, "..."] }]) ] @@ -157,6 +170,7 @@ module Agents errors.add(:base, "either url, url_from_event, or data_from_event are required") unless options['url'].present? || options['url_from_event'].present? || options['data_from_event'].present? errors.add(:base, "expected_update_period_in_days is required") unless options['expected_update_period_in_days'].present? validate_extract_options! + validate_template_options! validate_http_success_codes! # Check for optional fields @@ -281,6 +295,15 @@ module Agents end end + def validate_template_options! + template = options['template'].presence or return + + unless Hash === template && + template.each_pair.all? { |key, value| String === value } + errors.add(:base, 'template must be a hash of strings.') + end + end + def check check_urls(interpolated['url']) end @@ -343,20 +366,33 @@ module Agents extract_xml(doc) end - num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq - - if num_unique_lengths.length != 1 + if output.each_value.each_cons(2).any? { |m, n| m.size != n.size } raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}" end - old_events = previous_payloads num_unique_lengths.first - num_unique_lengths.first.times do |index| - result = {} - interpolated['extract'].keys.each do |name| - result[name] = output[name][index] - if name.to_s == 'url' && url.present? 
- result[name] = (url + Utils.normalize_uri(result[name])).to_s + num_tuples = output.each_value.first.size + + old_events = previous_payloads num_tuples + + template = options['template'].presence + + num_tuples.times do |index| + extracted = {} + interpolated['extract'].each_key do |name| + extracted[name] = output[name][index] + end + + result = + if template + interpolate_with(extracted) do + interpolate_options(template) + end + else + extracted end + + if payload_url = result['url'].presence + result['url'] = (url + Utils.normalize_uri(payload_url)).to_s end if store_payload!(old_events, result) diff --git a/spec/models/agents/website_agent_spec.rb b/spec/models/agents/website_agent_spec.rb index 9315a4e1..9cf125b0 100644 --- a/spec/models/agents/website_agent_spec.rb +++ b/spec/models/agents/website_agent_spec.rb @@ -739,6 +739,22 @@ describe Agents::WebsiteAgent do expect(event.payload['response_info']).to eq('The reponse was 200 OK.') end + it "should be formatted by template after extraction" do + @valid_options['template'] = { + 'url' => '{{url}}', + 'title' => '{{title | upcase}}', + 'summary' => '{{title}}: {{hovertext | truncate: 20}}', + } + @checker.options = @valid_options + @checker.check + event = Event.last + expect(event.payload).to eq({ + 'title' => 'EVOLVING', + 'url' => 'http://imgs.xkcd.com/comics/evolving.png', + 'summary' => 'Evolving: Biologists play r...', + }) + end + describe "XML" do before do stub_request(:any, /github_rss/).to_return( diff --git a/spec/support/shared_examples/agent_controller_concern.rb b/spec/support/shared_examples/agent_controller_concern.rb index cd08a893..d2c7d383 100644 --- a/spec/support/shared_examples/agent_controller_concern.rb +++ b/spec/support/shared_examples/agent_controller_concern.rb @@ -130,7 +130,10 @@ shared_examples_for AgentControllerConcern do end it "should configure targets with nested objects" do - agent.control_targets << agents(:bob_data_output_agent) + agent.control_targets = [ + 
agents(:bob_basecamp_agent), # does not support a `template` option, but anyway + agents(:bob_data_output_agent) + ] agent.options['action'] = 'configure' agent.options['configure_options'] = { template: {