Add a new option template to WebsiteAgent

If given, it is used as a Liquid template for each event created by the
Agent, instead of directly emitting the results of extraction as events.

An existing spec needs to be fixed because WebsiteAgent now has the
`template` option, which may not be a hash of hashes.
This commit is contained in:
Akinori MUSHA 2016-10-17 21:33:50 +09:00
parent 9a3290ef40
commit fe35df8752
3 changed files with 67 additions and 12 deletions

View file

@ -111,6 +111,17 @@ module Agents
Set `http_success_codes` to an array of status codes (e.g., `[404, 422]`) to treat HTTP response codes beyond 200 as successes.
If a `template` option is given, it is used as a Liquid template for each event created by this Agent, instead of directly emitting the results of extraction as events. In the template, keys of extracted data can be interpolated, and some additional variables are also available as explained in the next section. For example:
"template": {
"url": "{{ url }}",
"title": "{{ title }}",
"description": "{{ body_text }}",
"last_modified": "{{ _response_.headers.Last-Modified | date: '%FT%T' }}"
}
In the `on_change` mode, change is detected based on the resulted event payload after applying this option. If you want to add some keys to each event but ignore any change in them, set `mode` to `all` and put a DeDuplicationAgent downstream.
# Liquid Templating
In Liquid templating, the following variable is available:
@ -127,8 +138,10 @@ module Agents
MD
event_description do
keys = options['template'].presence || options['extract'].keys
"Events will have the following fields:\n\n %s" % [
Utils.pretty_print(Hash[options['extract'].keys.map { |key|
Utils.pretty_print(Hash[keys.map { |key|
[key, "..."]
}])
]
@ -157,6 +170,7 @@ module Agents
errors.add(:base, "either url, url_from_event, or data_from_event are required") unless options['url'].present? || options['url_from_event'].present? || options['data_from_event'].present?
errors.add(:base, "expected_update_period_in_days is required") unless options['expected_update_period_in_days'].present?
validate_extract_options!
validate_template_options!
validate_http_success_codes!
# Check for optional fields
@ -281,6 +295,15 @@ module Agents
end
end
def validate_template_options!
template = options['template'].presence or return
unless Hash === template &&
template.each_pair.all? { |key, value| String === value }
errors.add(:base, 'template must be a hash of strings.')
end
end
def check
check_urls(interpolated['url'])
end
@ -343,20 +366,33 @@ module Agents
extract_xml(doc)
end
num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
if num_unique_lengths.length != 1
if output.each_value.each_cons(2).any? { |m, n| m.size != n.size }
raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
end
old_events = previous_payloads num_unique_lengths.first
num_unique_lengths.first.times do |index|
result = {}
interpolated['extract'].keys.each do |name|
result[name] = output[name][index]
if name.to_s == 'url' && url.present?
result[name] = (url + Utils.normalize_uri(result[name])).to_s
num_tuples = output.each_value.first.size
old_events = previous_payloads num_tuples
template = options['template'].presence
num_tuples.times do |index|
extracted = {}
interpolated['extract'].each_key do |name|
extracted[name] = output[name][index]
end
result =
if template
interpolate_with(extracted) do
interpolate_options(template)
end
else
extracted
end
if payload_url = result['url'].presence
result['url'] = (url + Utils.normalize_uri(payload_url)).to_s
end
if store_payload!(old_events, result)

View file

@ -739,6 +739,22 @@ describe Agents::WebsiteAgent do
expect(event.payload['response_info']).to eq('The reponse was 200 OK.')
end
it "should be formatted by template after extraction" do
@valid_options['template'] = {
'url' => '{{url}}',
'title' => '{{title | upcase}}',
'summary' => '{{title}}: {{hovertext | truncate: 20}}',
}
@checker.options = @valid_options
@checker.check
event = Event.last
expect(event.payload).to eq({
'title' => 'EVOLVING',
'url' => 'http://imgs.xkcd.com/comics/evolving.png',
'summary' => 'Evolving: Biologists play r...',
})
end
describe "XML" do
before do
stub_request(:any, /github_rss/).to_return(

View file

@ -130,7 +130,10 @@ shared_examples_for AgentControllerConcern do
end
it "should configure targets with nested objects" do
agent.control_targets << agents(:bob_data_output_agent)
agent.control_targets = [
agents(:bob_basecamp_agent), # does not support a `template` option, but anyway
agents(:bob_data_output_agent)
]
agent.options['action'] = 'configure'
agent.options['configure_options'] = {
template: {