Merge pull request #1743 from cantino/website_agent_can_interpolate_after_extraction

WebsiteAgent can interpolate after extraction

Incorporating feedback from @cantino and @dsander.
This commit is contained in:
Akinori MUSHA 2016-11-01 20:20:37 +09:00 committed by GitHub
commit 91f096b16f
6 changed files with 222 additions and 15 deletions

View file

@ -189,6 +189,11 @@ module LiquidInterpolatable
url
end
# Rebase URIs contained in attributes in a given HTML fragment.
#
# Delegates to Utils.rebase_hrefs. This is best-effort: if the delegate
# raises a StandardError, the input is returned unchanged.
def rebase_hrefs(input, base_uri)
  Utils.rebase_hrefs(input, base_uri)
rescue StandardError
  input
end
# Unescape (basic) HTML entities in a string
#
# This currently decodes the following entities only: "'",

View file

@ -111,9 +111,22 @@ module Agents
Set `http_success_codes` to an array of status codes (e.g., `[404, 422]`) to treat HTTP response codes beyond 200 as successes.
If a `template` option is given, it is used as a Liquid template for each event created by this Agent, instead of directly emitting the results of extraction as events. In the template, keys of extracted data can be interpolated, and some additional variables are also available as explained in the next section. For example:
"template": {
"url": "{{ url }}",
"title": "{{ title }}",
"description": "{{ body_text }}",
"last_modified": "{{ _response_.headers.Last-Modified | date: '%FT%T' }}"
}
In the `on_change` mode, change is detected based on the resulting event payload after applying this option. If you want to add some keys to each event but ignore any change in them, set `mode` to `all` and put a DeDuplicationAgent downstream.
# Liquid Templating
In Liquid templating, the following variable is available:
In Liquid templating, the following variables are available except when invoked by `data_from_event`:
* `_url_`: The URL specified to fetch the content from.
* `_response_`: A response object with the following keys:
@ -121,14 +134,18 @@ module Agents
* `headers`: Response headers; for example, `{{ _response_.headers.Content-Type }}` expands to the value of the Content-Type header. Keys are case-insensitive, and `-` and `_` are treated as equivalent.
* `url`: The final URL of the fetched page, following redirects. Using this in the `template` option, you can resolve relative URLs extracted from a document like `{{ link | to_uri: _response_.url }}` and `{{ content | rebase_hrefs: _response_.url }}`.
# Ordering Events
#{description_events_order}
MD
event_description do
keys = options['template'].presence || options['extract'].keys
"Events will have the following fields:\n\n %s" % [
Utils.pretty_print(Hash[options['extract'].keys.map { |key|
Utils.pretty_print(Hash[keys.map { |key|
[key, "..."]
}])
]
@ -157,6 +174,7 @@ module Agents
errors.add(:base, "either url, url_from_event, or data_from_event are required") unless options['url'].present? || options['url_from_event'].present? || options['data_from_event'].present?
errors.add(:base, "expected_update_period_in_days is required") unless options['expected_update_period_in_days'].present?
validate_extract_options!
validate_template_options!
validate_http_success_codes!
# Check for optional fields
@ -281,6 +299,15 @@ module Agents
end
end
# Validates the optional `template` option.
#
# Does nothing when the option is absent or blank. Otherwise the option
# must be a Hash whose values are all Strings; if it is not, an error is
# added to `errors` on :base.
def validate_template_options!
  template = options['template'].presence
  return unless template
  return if template.is_a?(Hash) && template.each_value.all? { |value| value.is_a?(String) }

  errors.add(:base, 'template must be a hash of strings.')
end
# Passes the interpolated `url` option to check_urls.
def check
  target = interpolated['url']
  check_urls(target)
end
@ -305,6 +332,7 @@ module Agents
raise "Failed: #{response.inspect}" unless consider_response_successful?(response)
interpolation_context.stack {
interpolation_context['_url_'] = uri.to_s
interpolation_context['_response_'] = ResponseDrop.new(response)
handle_data(response.body, response.env[:url], existing_payload)
}
@ -343,20 +371,33 @@ module Agents
extract_xml(doc)
end
num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
if num_unique_lengths.length != 1
if output.each_value.each_cons(2).any? { |m, n| m.size != n.size }
raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
end
old_events = previous_payloads num_unique_lengths.first
num_unique_lengths.first.times do |index|
result = {}
interpolated['extract'].keys.each do |name|
result[name] = output[name][index]
if name.to_s == 'url' && url.present?
result[name] = (url + Utils.normalize_uri(result[name])).to_s
num_tuples = output.each_value.first.size
old_events = previous_payloads num_tuples
template = options['template'].presence
num_tuples.times do |index|
extracted = {}
interpolated['extract'].each_key do |name|
extracted[name] = output[name][index]
end
result =
if template
interpolate_with(extracted) do
interpolate_options(template)
end
else
extracted
end
if payload_url = result['url'].presence
result['url'] = (url + Utils.normalize_uri(payload_url)).to_s
end
if store_payload!(old_events, result)
@ -567,6 +608,11 @@ module Agents
# Returns the `status` of the wrapped object (@object).
def status
@object.status
end
# The URL recorded in the wrapped object's env, rendered as a String.
def url
  env = @object.env
  env.url.to_s
end
end
# Wraps Faraday::Utils::Headers

View file

@ -170,4 +170,93 @@ module Utils
nil
end
end
# Helpers for rewriting URI-valued attributes in HTML documents and
# fragments, parsed with Nokogiri.
module HTMLTransformer
  # Formats of attribute values that hold URIs.
  SINGLE = 1          # the whole (stripped) value is one URI
  MULTIPLE = 2        # whitespace-separated URIs
  COMMA_SEPARATED = 3 # comma-separated URIs
  SRCSET = 4          # comma-separated candidates, each starting with a URI

  # Element name => { attribute name => value format }.
  URI_ATTRIBUTES = {
    'a' => { 'href' => SINGLE },
    'applet' => { 'archive' => COMMA_SEPARATED, 'codebase' => SINGLE },
    'area' => { 'href' => SINGLE },
    'audio' => { 'src' => SINGLE },
    'base' => { 'href' => SINGLE },
    'blockquote' => { 'cite' => SINGLE },
    'body' => { 'background' => SINGLE },
    'button' => { 'formaction' => SINGLE },
    'command' => { 'icon' => SINGLE },
    'del' => { 'cite' => SINGLE },
    'embed' => { 'src' => SINGLE },
    'form' => { 'action' => SINGLE },
    'frame' => { 'longdesc' => SINGLE, 'src' => SINGLE },
    'head' => { 'profile' => SINGLE },
    'html' => { 'manifest' => SINGLE },
    'iframe' => { 'longdesc' => SINGLE, 'src' => SINGLE },
    'img' => { 'longdesc' => SINGLE, 'src' => SINGLE, 'srcset' => SRCSET, 'usemap' => SINGLE },
    'input' => { 'formaction' => SINGLE, 'src' => SINGLE, 'usemap' => SINGLE },
    'ins' => { 'cite' => SINGLE },
    'link' => { 'href' => SINGLE },
    'object' => { 'archive' => MULTIPLE, 'classid' => SINGLE, 'codebase' => SINGLE, 'data' => SINGLE, 'usemap' => SINGLE },
    'q' => { 'cite' => SINGLE },
    'script' => { 'src' => SINGLE },
    'source' => { 'src' => SINGLE, 'srcset' => SRCSET },
    'video' => { 'poster' => SINGLE, 'src' => SINGLE },
  }.freeze

  # XPath selecting every element whose name is a key of URI_ATTRIBUTES.
  URI_ELEMENTS_XPATH = '//*[%s]' % URI_ATTRIBUTES.keys.map { |name| "name()='#{name}'" }.join(' or ')

  module_function

  # Parses +html+, yields the parsed document to the block so it can be
  # modified in place, and returns the result serialized to a String.
  #
  # Input starting with an XML declaration or DOCTYPE is serialized as a
  # whole document. Input starting with <html>, <head> or <body> is
  # serialized from that element onwards. Anything else is treated as a
  # fragment and only the children of <body> are returned.
  #
  # Raises ArgumentError if no block is given.
  def transform(html, &block)
    raise ArgumentError, 'block must be given' unless block

    case html
    when /\A\s*(?:<\?xml[\s?]|<!DOCTYPE\s)/i
      doc = Nokogiri.parse(html)
      yield doc
      doc.to_s
    when /\A\s*<(html|head|body)[\s>]/i
      # Libxml2 automatically adds DOCTYPE and <html>, so we need to
      # skip them.
      #
      # The regexp is case-insensitive but the HTML parser lowercases
      # element names and XPath is case-sensitive, so the captured name
      # must be downcased or input such as "<BODY>" finds no node.
      element_name = Regexp.last_match(1).downcase
      doc = Nokogiri::HTML::Document.parse(html)
      yield doc
      doc.at_xpath("//#{element_name}").xpath('self::node() | following-sibling::node()').to_s
    else
      doc = Nokogiri::HTML::Document.parse("<html><body>#{html}")
      yield doc
      doc.xpath("/html/body/node()").to_s
    end
  end

  # Replaces every URI found in the attributes listed in URI_ATTRIBUTES
  # with the value the block returns for it, and returns the transformed
  # HTML as a String (see #transform).
  #
  # Raises ArgumentError if no block is given.
  def replace_uris(html, &block)
    raise ArgumentError, 'block must be given' unless block

    transform(html) do |doc|
      doc.xpath(URI_ELEMENTS_XPATH).each do |element|
        uri_attrs = URI_ATTRIBUTES[element.name]
        next unless uri_attrs

        uri_attrs.each do |name, format|
          attr = element.attribute(name)
          next unless attr

          case format
          when SINGLE
            attr.value = block.call(attr.value.strip)
          when MULTIPLE
            attr.value = attr.value.gsub(/(\S+)/) { block.call(Regexp.last_match(1)) }
          when COMMA_SEPARATED
            # A URI ends at the next comma, so lists written without
            # spaces ("a.jar,b.jar") are still split into separate URIs.
            attr.value = attr.value.gsub(/((?:\A|,)\s*)([^\s,]+)/) {
              Regexp.last_match(1) + block.call(Regexp.last_match(2))
            }
          when SRCSET
            # Only the first token of each candidate is a URI; the
            # descriptor that follows it (e.g. "2x") is left untouched.
            attr.value = attr.value.gsub(/((?:\A|,)\s*)(\S+)/) {
              Regexp.last_match(1) + block.call(Regexp.last_match(2))
            }
          end
        end
      end
    end
  end
end
# Rewrites every URI attribute in +html+ so that it is resolved against
# +base_uri+, and returns the resulting HTML.
def self.rebase_hrefs(html, base_uri)
  base = normalize_uri(base_uri)
  HTMLTransformer.replace_uris(html) do |reference|
    base.merge(normalize_uri(reference)).to_s
  end
end
end

View file

@ -323,4 +323,42 @@ describe LiquidInterpolatable::Filters do
end
end
end
# Exercises the `rebase_hrefs` Liquid filter through an agent's interpolation.
describe 'rebase_hrefs' do
let(:agent) { Agents::InterpolatableAgent.new(name: "test") }
# Input fragment with relative `href`, `src` and `srcset` values.
let(:fragment) { <<HTML }
<ul>
<li>
<a href="downloads/file1"><img src="/images/iconA.png" srcset="/images/iconA.png 1x, /images/iconA@2x.png 2x">file1</a>
</li>
<li>
<a href="downloads/file2"><img src="/images/iconA.png" srcset="/images/iconA.png 1x, /images/iconA@2x.png 2x">file2</a>
</li>
<li>
<a href="downloads/file3"><img src="/images/iconB.png" srcset="/images/iconB.png 1x, /images/iconB@2x.png 2x">file3</a>
</li>
</ul>
HTML
# The same fragment with every URI resolved against
# http://example.com/support/files.html.
let(:replaced_fragment) { <<HTML }
<ul>
<li>
<a href="http://example.com/support/downloads/file1"><img src="http://example.com/images/iconA.png" srcset="http://example.com/images/iconA.png 1x, http://example.com/images/iconA@2x.png 2x">file1</a>
</li>
<li>
<a href="http://example.com/support/downloads/file2"><img src="http://example.com/images/iconA.png" srcset="http://example.com/images/iconA.png 1x, http://example.com/images/iconA@2x.png 2x">file2</a>
</li>
<li>
<a href="http://example.com/support/downloads/file3"><img src="http://example.com/images/iconB.png" srcset="http://example.com/images/iconB.png 1x, http://example.com/images/iconB@2x.png 2x">file3</a>
</li>
</ul>
HTML
# The fragment is exposed as `content` and piped through the filter.
it 'rebases relative URLs in a fragment' do
agent.interpolation_context['content'] = fragment
agent.options['template'] = "{{ content | rebase_hrefs: 'http://example.com/support/files.html' }}"
expect(agent.interpolated['template']).to eq(replaced_fragment)
end
end
end

View file

@ -8,6 +8,10 @@ describe Agents::WebsiteAgent do
headers: {
'X-Status-Message' => 'OK'
})
stub_request(:any, /xkcd\.com\/index$/).to_return(status: 301,
headers: {
'Location' => 'http://xkcd.com/'
})
@valid_options = {
'name' => "XKCD",
'expected_update_period_in_days' => "2",
@ -729,14 +733,36 @@ describe Agents::WebsiteAgent do
end
it "should interpolate _response_" do
@valid_options['url'] = 'http://xkcd.com/index'
@valid_options['extract']['response_info'] =
@valid_options['extract']['url'].merge(
'value' => '"{{ "The reponse was " | append:_response_.status | append:" " | append:_response_.headers.X-Status-Message | append:"." }}"'
'value' => '{{ "The reponse from " | append:_response_.url | append:" was " | append:_response_.status | append:" " | append:_response_.headers.X-Status-Message | append:"." | to_xpath }}'
)
@valid_options['extract']['original_url'] =
@valid_options['extract']['url'].merge(
'value' => '{{ _url_ | to_xpath }}'
)
@checker.options = @valid_options
@checker.check
event = Event.last
expect(event.payload['response_info']).to eq('The reponse was 200 OK.')
expect(event.payload['response_info']).to eq('The reponse from http://xkcd.com/ was 200 OK.')
expect(event.payload['original_url']).to eq('http://xkcd.com/index')
end
# With a `template` option set, the emitted payload is expected to contain
# exactly the template's keys, with their values interpolated.
it "should be formatted by template after extraction" do
@valid_options['template'] = {
'url' => '{{url}}',
'title' => '{{title | upcase}}',
'summary' => '{{title}}: {{hovertext | truncate: 20}}',
}
@checker.options = @valid_options
@checker.check
event = Event.last
# The expected payload has no `hovertext` key: that value appears only
# inside `summary`.
expect(event.payload).to eq({
'title' => 'EVOLVING',
'url' => 'http://imgs.xkcd.com/comics/evolving.png',
'summary' => 'Evolving: Biologists play r...',
})
end
describe "XML" do

View file

@ -130,7 +130,10 @@ shared_examples_for AgentControllerConcern do
end
it "should configure targets with nested objects" do
agent.control_targets << agents(:bob_data_output_agent)
agent.control_targets = [
agents(:bob_basecamp_agent), # does not support a `template` option, but anyway
agents(:bob_data_output_agent)
]
agent.options['action'] = 'configure'
agent.options['configure_options'] = {
template: {