From fe35df87522247bc646753dd59520aa58c9b3422 Mon Sep 17 00:00:00 2001 From: Akinori MUSHA Date: Mon, 17 Oct 2016 21:33:50 +0900 Subject: [PATCH 1/3] Add a new option `template` to WebsiteAgent If given, it is used as a Liquid template for each event created by the Agent, instead of directly emitting the results of extraction as events. An existing spec needs to be fixed because WebsiteAgent now has the `template` option, which may not be a hash of hashes. --- app/models/agents/website_agent.rb | 58 +++++++++++++++---- spec/models/agents/website_agent_spec.rb | 16 +++++ .../agent_controller_concern.rb | 5 +- 3 files changed, 67 insertions(+), 12 deletions(-) diff --git a/app/models/agents/website_agent.rb b/app/models/agents/website_agent.rb index bbb1a482..db787d77 100644 --- a/app/models/agents/website_agent.rb +++ b/app/models/agents/website_agent.rb @@ -111,6 +111,17 @@ module Agents Set `http_success_codes` to an array of status codes (e.g., `[404, 422]`) to treat HTTP response codes beyond 200 as successes. + If a `template` option is given, it is used as a Liquid template for each event created by this Agent, instead of directly emitting the results of extraction as events. In the template, keys of extracted data can be interpolated, and some additional variables are also available as explained in the next section. For example: + + "template": { + "url": "{{ url }}", + "title": "{{ title }}", + "description": "{{ body_text }}", + "last_modified": "{{ _response_.headers.Last-Modified | date: '%FT%T' }}" + } + + In the `on_change` mode, change is detected based on the resulted event payload after applying this option. If you want to add some keys to each event but ignore any change in them, set `mode` to `all` and put a DeDuplicationAgent downstream. 
+ # Liquid Templating In Liquid templating, the following variable is available: @@ -127,8 +138,10 @@ module Agents MD event_description do + keys = options['template'].presence || options['extract'].keys + "Events will have the following fields:\n\n %s" % [ - Utils.pretty_print(Hash[options['extract'].keys.map { |key| + Utils.pretty_print(Hash[keys.map { |key| [key, "..."] }]) ] @@ -157,6 +170,7 @@ module Agents errors.add(:base, "either url, url_from_event, or data_from_event are required") unless options['url'].present? || options['url_from_event'].present? || options['data_from_event'].present? errors.add(:base, "expected_update_period_in_days is required") unless options['expected_update_period_in_days'].present? validate_extract_options! + validate_template_options! validate_http_success_codes! # Check for optional fields @@ -281,6 +295,15 @@ module Agents end end + def validate_template_options! + template = options['template'].presence or return + + unless Hash === template && + template.each_pair.all? { |key, value| String === value } + errors.add(:base, 'template must be a hash of strings.') + end + end + def check check_urls(interpolated['url']) end @@ -343,20 +366,33 @@ module Agents extract_xml(doc) end - num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq - - if num_unique_lengths.length != 1 + if output.each_value.each_cons(2).any? { |m, n| m.size != n.size } raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}" end - old_events = previous_payloads num_unique_lengths.first - num_unique_lengths.first.times do |index| - result = {} - interpolated['extract'].keys.each do |name| - result[name] = output[name][index] - if name.to_s == 'url' && url.present? 
- result[name] = (url + Utils.normalize_uri(result[name])).to_s + num_tuples = output.each_value.first.size + + old_events = previous_payloads num_tuples + + template = options['template'].presence + + num_tuples.times do |index| + extracted = {} + interpolated['extract'].each_key do |name| + extracted[name] = output[name][index] + end + + result = + if template + interpolate_with(extracted) do + interpolate_options(template) + end + else + extracted end + + if payload_url = result['url'].presence + result['url'] = (url + Utils.normalize_uri(payload_url)).to_s end if store_payload!(old_events, result) diff --git a/spec/models/agents/website_agent_spec.rb b/spec/models/agents/website_agent_spec.rb index 9315a4e1..9cf125b0 100644 --- a/spec/models/agents/website_agent_spec.rb +++ b/spec/models/agents/website_agent_spec.rb @@ -739,6 +739,22 @@ describe Agents::WebsiteAgent do expect(event.payload['response_info']).to eq('The reponse was 200 OK.') end + it "should be formatted by template after extraction" do + @valid_options['template'] = { + 'url' => '{{url}}', + 'title' => '{{title | upcase}}', + 'summary' => '{{title}}: {{hovertext | truncate: 20}}', + } + @checker.options = @valid_options + @checker.check + event = Event.last + expect(event.payload).to eq({ + 'title' => 'EVOLVING', + 'url' => 'http://imgs.xkcd.com/comics/evolving.png', + 'summary' => 'Evolving: Biologists play r...', + }) + end + describe "XML" do before do stub_request(:any, /github_rss/).to_return( diff --git a/spec/support/shared_examples/agent_controller_concern.rb b/spec/support/shared_examples/agent_controller_concern.rb index cd08a893..d2c7d383 100644 --- a/spec/support/shared_examples/agent_controller_concern.rb +++ b/spec/support/shared_examples/agent_controller_concern.rb @@ -130,7 +130,10 @@ shared_examples_for AgentControllerConcern do end it "should configure targets with nested objects" do - agent.control_targets << agents(:bob_data_output_agent) + agent.control_targets = [ + 
agents(:bob_basecamp_agent), # does not support a `template` option, but anyway + agents(:bob_data_output_agent) + ] agent.options['action'] = 'configure' agent.options['configure_options'] = { template: { From 8b897f5da34311a2afcdc72ef8dcef35cb5ca0c7 Mon Sep 17 00:00:00 2001 From: Akinori MUSHA Date: Mon, 17 Oct 2016 20:45:56 +0900 Subject: [PATCH 2/3] Add Liquid variables `_response_.url` and `_url_` to WebsiteAgent --- app/models/agents/website_agent.rb | 12 +++++++++++- spec/models/agents/website_agent_spec.rb | 14 ++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/app/models/agents/website_agent.rb b/app/models/agents/website_agent.rb index db787d77..bdb5d6e9 100644 --- a/app/models/agents/website_agent.rb +++ b/app/models/agents/website_agent.rb @@ -124,7 +124,9 @@ module Agents # Liquid Templating - In Liquid templating, the following variable is available: + In Liquid templating, the following variables are available except when invoked by `data_from_event`: + + * `_url_`: The URL specified to fetch the content from. * `_response_`: A response object with the following keys: @@ -132,6 +134,8 @@ module Agents * `headers`: Response headers; for example, `{{ _response_.headers.Content-Type }}` expands to the value of the Content-Type header. Keys are insensitive to cases and -/_. + * `url`: The final URL of the fetched page, following redirects. 
+ # Ordering Events #{description_events_order} @@ -328,6 +332,7 @@ module Agents raise "Failed: #{response.inspect}" unless consider_response_successful?(response) interpolation_context.stack { + interpolation_context['_url_'] = uri.to_s interpolation_context['_response_'] = ResponseDrop.new(response) handle_data(response.body, response.env[:url], existing_payload) } @@ -603,6 +608,11 @@ module Agents def status @object.status end + + # The URL + def url + @object.env.url.to_s + end end # Wraps Faraday::Utils::Headers diff --git a/spec/models/agents/website_agent_spec.rb b/spec/models/agents/website_agent_spec.rb index 9cf125b0..363b05a5 100644 --- a/spec/models/agents/website_agent_spec.rb +++ b/spec/models/agents/website_agent_spec.rb @@ -8,6 +8,10 @@ describe Agents::WebsiteAgent do headers: { 'X-Status-Message' => 'OK' }) + stub_request(:any, /xkcd\.com\/index$/).to_return(status: 301, + headers: { + 'Location' => 'http://xkcd.com/' + }) @valid_options = { 'name' => "XKCD", 'expected_update_period_in_days' => "2", @@ -729,14 +733,20 @@ describe Agents::WebsiteAgent do end it "should interpolate _response_" do + @valid_options['url'] = 'http://xkcd.com/index' @valid_options['extract']['response_info'] = @valid_options['extract']['url'].merge( - 'value' => '"{{ "The reponse was " | append:_response_.status | append:" " | append:_response_.headers.X-Status-Message | append:"." }}"' + 'value' => '{{ "The reponse from " | append:_response_.url | append:" was " | append:_response_.status | append:" " | append:_response_.headers.X-Status-Message | append:"." 
| to_xpath }}' + ) + @valid_options['extract']['original_url'] = + @valid_options['extract']['url'].merge( + 'value' => '{{ _url_ | to_xpath }}' ) @checker.options = @valid_options @checker.check event = Event.last - expect(event.payload['response_info']).to eq('The reponse was 200 OK.') + expect(event.payload['response_info']).to eq('The reponse from http://xkcd.com/ was 200 OK.') + expect(event.payload['original_url']).to eq('http://xkcd.com/index') end it "should be formatted by template after extraction" do From 58fabb885c4fdc38e68c75f4c47a755c98aba565 Mon Sep 17 00:00:00 2001 From: Akinori MUSHA Date: Wed, 19 Oct 2016 05:14:33 +0900 Subject: [PATCH 3/3] Add a new Liquid filter `rebase_hrefs` --- app/concerns/liquid_interpolatable.rb | 5 ++ app/models/agents/website_agent.rb | 2 +- lib/utils.rb | 89 +++++++++++++++++++++ spec/concerns/liquid_interpolatable_spec.rb | 38 +++++++++ 4 files changed, 133 insertions(+), 1 deletion(-) diff --git a/app/concerns/liquid_interpolatable.rb b/app/concerns/liquid_interpolatable.rb index dde867a6..83d93dec 100644 --- a/app/concerns/liquid_interpolatable.rb +++ b/app/concerns/liquid_interpolatable.rb @@ -189,6 +189,11 @@ module LiquidInterpolatable url end + # Rebase URIs contained in attributes in a given HTML fragment + def rebase_hrefs(input, base_uri) + Utils.rebase_hrefs(input, base_uri) rescue input + end + # Unescape (basic) HTML entities in a string # # This currently decodes the following entities only: "'", diff --git a/app/models/agents/website_agent.rb b/app/models/agents/website_agent.rb index bdb5d6e9..5425f196 100644 --- a/app/models/agents/website_agent.rb +++ b/app/models/agents/website_agent.rb @@ -134,7 +134,7 @@ module Agents * `headers`: Response headers; for example, `{{ _response_.headers.Content-Type }}` expands to the value of the Content-Type header. Keys are insensitive to cases and -/_. - * `url`: The final URL of the fetched page, following redirects. 
+ * `url`: The final URL of the fetched page, following redirects. Using this in the `template` option, you can resolve relative URLs extracted from a document like `{{ link | to_uri: _response_.url }}` and `{{ content | rebase_hrefs: _response_.url }}`. # Ordering Events diff --git a/lib/utils.rb b/lib/utils.rb index c192cea2..478946cd 100644 --- a/lib/utils.rb +++ b/lib/utils.rb @@ -170,4 +170,93 @@ module Utils nil end end + + module HTMLTransformer + SINGLE = 1 + MULTIPLE = 2 + COMMA_SEPARATED = 3 + SRCSET = 4 + + URI_ATTRIBUTES = { + 'a' => { 'href' => SINGLE }, + 'applet' => { 'archive' => COMMA_SEPARATED, 'codebase' => SINGLE }, + 'area' => { 'href' => SINGLE }, + 'audio' => { 'src' => SINGLE }, + 'base' => { 'href' => SINGLE }, + 'blockquote' => { 'cite' => SINGLE }, + 'body' => { 'background' => SINGLE }, + 'button' => { 'formaction' => SINGLE }, + 'command' => { 'icon' => SINGLE }, + 'del' => { 'cite' => SINGLE }, + 'embed' => { 'src' => SINGLE }, + 'form' => { 'action' => SINGLE }, + 'frame' => { 'longdesc' => SINGLE, 'src' => SINGLE }, + 'head' => { 'profile' => SINGLE }, + 'html' => { 'manifest' => SINGLE }, + 'iframe' => { 'longdesc' => SINGLE, 'src' => SINGLE }, + 'img' => { 'longdesc' => SINGLE, 'src' => SINGLE, 'srcset' => SRCSET, 'usemap' => SINGLE }, + 'input' => { 'formaction' => SINGLE, 'src' => SINGLE, 'usemap' => SINGLE }, + 'ins' => { 'cite' => SINGLE }, + 'link' => { 'href' => SINGLE }, + 'object' => { 'archive' => MULTIPLE, 'classid' => SINGLE, 'codebase' => SINGLE, 'data' => SINGLE, 'usemap' => SINGLE }, + 'q' => { 'cite' => SINGLE }, + 'script' => { 'src' => SINGLE }, + 'source' => { 'src' => SINGLE, 'srcset' => SRCSET }, + 'video' => { 'poster' => SINGLE, 'src' => SINGLE }, + } + + URI_ELEMENTS_XPATH = '//*[%s]' % URI_ATTRIBUTES.keys.map { |name| "name()='#{name}'" }.join(' or ') + + module_function + + def transform(html, &block) + block or raise ArgumentError, 'block must be given' + + case html + when /\A\s*(?:<\?xml[\s?]|]/i + # 
Libxml2 automatically adds DOCTYPE and , so we need to + # skip them. + element_name = $1 + doc = Nokogiri::HTML::Document.parse(html) + yield doc + doc.at_xpath("//#{element_name}").xpath('self::node() | following-sibling::node()').to_s + else + doc = Nokogiri::HTML::Document.parse("#{html}") + yield doc + doc.xpath("/html/body/node()").to_s + end + end + + def replace_uris(html, &block) + block or raise ArgumentError, 'block must be given' + + transform(html) { |doc| + doc.xpath(URI_ELEMENTS_XPATH).each { |element| + uri_attrs = URI_ATTRIBUTES[element.name] or next + uri_attrs.each { |name, format| + attr = element.attribute(name) or next + case format + when SINGLE + attr.value = block.call(attr.value.strip) + when MULTIPLE + attr.value = attr.value.gsub(/(\S+)/) { block.call($1) } + when COMMA_SEPARATED, SRCSET + attr.value = attr.value.gsub(/((?:\A|,)\s*)(\S+)/) { $1 + block.call($2) } + end + } + } + } + end + end + + def self.rebase_hrefs(html, base_uri) + base_uri = normalize_uri(base_uri) + HTMLTransformer.replace_uris(html) { |url| + base_uri.merge(normalize_uri(url)).to_s + } + end end diff --git a/spec/concerns/liquid_interpolatable_spec.rb b/spec/concerns/liquid_interpolatable_spec.rb index cc7bbbf3..6c7c9a71 100644 --- a/spec/concerns/liquid_interpolatable_spec.rb +++ b/spec/concerns/liquid_interpolatable_spec.rb @@ -323,4 +323,42 @@ describe LiquidInterpolatable::Filters do end end end + + describe 'rebase_hrefs' do + let(:agent) { Agents::InterpolatableAgent.new(name: "test") } + + let(:fragment) { < +
  • + file1 +
  • +
  • + file2 +
  • +
  • + file3 +
  • + +HTML + + let(:replaced_fragment) { < +
  • + file1 +
  • +
  • + file2 +
  • +
  • + file3 +
  • + +HTML + + it 'rebases relative URLs in a fragment' do + agent.interpolation_context['content'] = fragment + agent.options['template'] = "{{ content | rebase_hrefs: 'http://example.com/support/files.html' }}" + expect(agent.interpolated['template']).to eq(replaced_fragment) + end + end end