mirror of
https://github.com/Fishwaldo/huginn.git
synced 2025-03-15 19:31:26 +00:00
Merge pull request #1743 from cantino/website_agent_can_interpolate_after_extraction
WebsiteAgent can interpolate after extraction Incorporating feedback from @cantino and @dsander.
This commit is contained in:
commit
91f096b16f
6 changed files with 222 additions and 15 deletions
|
@ -189,6 +189,11 @@ module LiquidInterpolatable
|
|||
url
|
||||
end
|
||||
|
||||
# Rebase URIs contained in attributes in a given HTML fragment
#
# Delegates to Utils.rebase_hrefs.  This is a best-effort filter: if the
# rewrite raises any StandardError, the input is returned unchanged
# instead of aborting the template rendering.
def rebase_hrefs(input, base_uri)
  Utils.rebase_hrefs(input, base_uri)
rescue StandardError
  input
end
|
||||
|
||||
# Unescape (basic) HTML entities in a string
|
||||
#
|
||||
# This currently decodes the following entities only: "'",
|
||||
|
|
|
@ -111,9 +111,22 @@ module Agents
|
|||
|
||||
Set `http_success_codes` to an array of status codes (e.g., `[404, 422]`) to treat HTTP response codes beyond 200 as successes.
|
||||
|
||||
If a `template` option is given, it is used as a Liquid template for each event created by this Agent, instead of directly emitting the results of extraction as events. In the template, keys of extracted data can be interpolated, and some additional variables are also available as explained in the next section. For example:
|
||||
|
||||
"template": {
|
||||
"url": "{{ url }}",
|
||||
"title": "{{ title }}",
|
||||
"description": "{{ body_text }}",
|
||||
"last_modified": "{{ _response_.headers.Last-Modified | date: '%FT%T' }}"
|
||||
}
|
||||
|
||||
In the `on_change` mode, change is detected based on the resulting event payload after applying this option. If you want to add some keys to each event but ignore any change in them, set `mode` to `all` and put a DeDuplicationAgent downstream.
|
||||
|
||||
# Liquid Templating
|
||||
|
||||
In Liquid templating, the following variable is available:
|
||||
In Liquid templating, the following variables are available except when invoked by `data_from_event`:
|
||||
|
||||
* `_url_`: The URL specified to fetch the content from.
|
||||
|
||||
* `_response_`: A response object with the following keys:
|
||||
|
||||
|
@ -121,14 +134,18 @@ module Agents
|
|||
|
||||
* `headers`: Response headers; for example, `{{ _response_.headers.Content-Type }}` expands to the value of the Content-Type header. Keys are case-insensitive, and `-` and `_` are interchangeable.
|
||||
|
||||
* `url`: The final URL of the fetched page, following redirects. Using this in the `template` option, you can resolve relative URLs extracted from a document like `{{ link | to_uri: _response_.url }}` and `{{ content | rebase_hrefs: _response_.url }}`.
|
||||
|
||||
# Ordering Events
|
||||
|
||||
#{description_events_order}
|
||||
MD
|
||||
|
||||
event_description do
|
||||
keys = options['template'].presence || options['extract'].keys
|
||||
|
||||
"Events will have the following fields:\n\n %s" % [
|
||||
Utils.pretty_print(Hash[options['extract'].keys.map { |key|
|
||||
Utils.pretty_print(Hash[keys.map { |key|
|
||||
[key, "..."]
|
||||
}])
|
||||
]
|
||||
|
@ -157,6 +174,7 @@ module Agents
|
|||
errors.add(:base, "either url, url_from_event, or data_from_event are required") unless options['url'].present? || options['url_from_event'].present? || options['data_from_event'].present?
|
||||
errors.add(:base, "expected_update_period_in_days is required") unless options['expected_update_period_in_days'].present?
|
||||
validate_extract_options!
|
||||
validate_template_options!
|
||||
validate_http_success_codes!
|
||||
|
||||
# Check for optional fields
|
||||
|
@ -281,6 +299,15 @@ module Agents
|
|||
end
|
||||
end
|
||||
|
||||
# Validates the optional `template` option.
#
# Nothing is checked when the option is absent or blank.  Otherwise it
# must be a Hash whose values are all Strings; if not, an error is added
# to the record's base errors.
def validate_template_options!
  template = options['template'].presence
  return unless template

  well_formed = template.is_a?(Hash) &&
                template.each_value.all? { |value| value.is_a?(String) }
  return if well_formed

  errors.add(:base, 'template must be a hash of strings.')
end
|
||||
|
||||
def check
|
||||
check_urls(interpolated['url'])
|
||||
end
|
||||
|
@ -305,6 +332,7 @@ module Agents
|
|||
raise "Failed: #{response.inspect}" unless consider_response_successful?(response)
|
||||
|
||||
interpolation_context.stack {
|
||||
interpolation_context['_url_'] = uri.to_s
|
||||
interpolation_context['_response_'] = ResponseDrop.new(response)
|
||||
handle_data(response.body, response.env[:url], existing_payload)
|
||||
}
|
||||
|
@ -343,20 +371,33 @@ module Agents
|
|||
extract_xml(doc)
|
||||
end
|
||||
|
||||
num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
|
||||
|
||||
if num_unique_lengths.length != 1
|
||||
if output.each_value.each_cons(2).any? { |m, n| m.size != n.size }
|
||||
raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
|
||||
end
|
||||
|
||||
old_events = previous_payloads num_unique_lengths.first
|
||||
num_unique_lengths.first.times do |index|
|
||||
result = {}
|
||||
interpolated['extract'].keys.each do |name|
|
||||
result[name] = output[name][index]
|
||||
if name.to_s == 'url' && url.present?
|
||||
result[name] = (url + Utils.normalize_uri(result[name])).to_s
|
||||
num_tuples = output.each_value.first.size
|
||||
|
||||
old_events = previous_payloads num_tuples
|
||||
|
||||
template = options['template'].presence
|
||||
|
||||
num_tuples.times do |index|
|
||||
extracted = {}
|
||||
interpolated['extract'].each_key do |name|
|
||||
extracted[name] = output[name][index]
|
||||
end
|
||||
|
||||
result =
|
||||
if template
|
||||
interpolate_with(extracted) do
|
||||
interpolate_options(template)
|
||||
end
|
||||
else
|
||||
extracted
|
||||
end
|
||||
|
||||
if payload_url = result['url'].presence
|
||||
result['url'] = (url + Utils.normalize_uri(payload_url)).to_s
|
||||
end
|
||||
|
||||
if store_payload!(old_events, result)
|
||||
|
@ -567,6 +608,11 @@ module Agents
|
|||
def status
|
||||
@object.status
|
||||
end
|
||||
|
||||
# The URL recorded in the wrapped response's environment, as a String.
# This is the value exposed to templates as `_response_.url`.
def url
  effective_url = @object.env.url
  effective_url.to_s
end
|
||||
end
|
||||
|
||||
# Wraps Faraday::Utils::Headers
|
||||
|
|
89
lib/utils.rb
89
lib/utils.rb
|
@ -170,4 +170,93 @@ module Utils
|
|||
nil
|
||||
end
|
||||
end
|
||||
|
||||
# Helpers for rewriting the URI-valued attributes of an HTML (or XML)
# document or fragment while leaving the rest of the markup in place.
module HTMLTransformer
  # Formats an attribute value can take.
  SINGLE = 1           # exactly one URI
  MULTIPLE = 2         # whitespace-separated list of URIs
  COMMA_SEPARATED = 3  # comma-separated list of URIs
  SRCSET = 4           # comma-separated candidates, each a URI plus an optional descriptor

  # Element name => { attribute name => value format } for every
  # attribute that is rewritten by replace_uris.
  URI_ATTRIBUTES = {
    'a' => { 'href' => SINGLE },
    'applet' => { 'archive' => COMMA_SEPARATED, 'codebase' => SINGLE },
    'area' => { 'href' => SINGLE },
    'audio' => { 'src' => SINGLE },
    'base' => { 'href' => SINGLE },
    'blockquote' => { 'cite' => SINGLE },
    'body' => { 'background' => SINGLE },
    'button' => { 'formaction' => SINGLE },
    'command' => { 'icon' => SINGLE },
    'del' => { 'cite' => SINGLE },
    'embed' => { 'src' => SINGLE },
    'form' => { 'action' => SINGLE },
    'frame' => { 'longdesc' => SINGLE, 'src' => SINGLE },
    'head' => { 'profile' => SINGLE },
    'html' => { 'manifest' => SINGLE },
    'iframe' => { 'longdesc' => SINGLE, 'src' => SINGLE },
    'img' => { 'longdesc' => SINGLE, 'src' => SINGLE, 'srcset' => SRCSET, 'usemap' => SINGLE },
    'input' => { 'formaction' => SINGLE, 'src' => SINGLE, 'usemap' => SINGLE },
    'ins' => { 'cite' => SINGLE },
    'link' => { 'href' => SINGLE },
    'object' => { 'archive' => MULTIPLE, 'classid' => SINGLE, 'codebase' => SINGLE, 'data' => SINGLE, 'usemap' => SINGLE },
    'q' => { 'cite' => SINGLE },
    'script' => { 'src' => SINGLE },
    'source' => { 'src' => SINGLE, 'srcset' => SRCSET },
    'video' => { 'poster' => SINGLE, 'src' => SINGLE },
  }.each_value(&:freeze).freeze

  # XPath selecting every element listed in URI_ATTRIBUTES.
  URI_ELEMENTS_XPATH = ('//*[%s]' % URI_ATTRIBUTES.keys.map { |name| "name()='#{name}'" }.join(' or ')).freeze

  module_function

  # Parses +html+, yields the parsed Nokogiri document to the block so it
  # can be modified in place, and returns the re-serialized markup.
  #
  # * Input starting with an XML declaration or a DOCTYPE is parsed with
  #   Nokogiri.parse and serialized as a whole document.
  # * Input starting with <html>, <head> or <body> is parsed as an HTML
  #   document and serialized from that element onward.
  # * Anything else is treated as a fragment of <body> content.
  #
  # Raises ArgumentError if no block is given.
  def transform(html, &block)
    raise ArgumentError, 'block must be given' unless block

    case html
    when /\A\s*(?:<\?xml[\s?]|<!DOCTYPE\s)/i
      doc = Nokogiri.parse(html)
      yield doc
      doc.to_s
    when /\A\s*<(html|head|body)[\s>]/i
      # Libxml2 automatically adds DOCTYPE and <html>, so we need to
      # skip them.
      #
      # The pattern above is case-insensitive, but the HTML parser
      # lower-cases element names and XPath name tests are
      # case-sensitive, so the captured name must be down-cased before
      # it is used to locate the element (otherwise "<BODY>" finds
      # nothing).
      element_name = Regexp.last_match(1).downcase
      doc = Nokogiri::HTML::Document.parse(html)
      yield doc
      doc.at_xpath("//#{element_name}").xpath('self::node() | following-sibling::node()').to_s
    else
      doc = Nokogiri::HTML::Document.parse("<html><body>#{html}")
      yield doc
      doc.xpath("/html/body/node()").to_s
    end
  end

  # Replaces every URI found in the attributes listed in URI_ATTRIBUTES
  # with the value returned by the block, which is called once per URI
  # with the original URI string.  Returns the rewritten markup.
  #
  # Raises ArgumentError if no block is given.
  def replace_uris(html, &block)
    raise ArgumentError, 'block must be given' unless block

    transform(html) do |doc|
      doc.xpath(URI_ELEMENTS_XPATH).each do |element|
        uri_attrs = URI_ATTRIBUTES[element.name]
        next unless uri_attrs

        uri_attrs.each do |name, format|
          attr = element.attribute(name)
          next unless attr

          attr.value = rewrite_attribute_value(attr.value, format, &block)
        end
      end
    end
  end

  # Rewrites the URIs inside a single attribute +value+ according to its
  # +format+ (one of SINGLE, MULTIPLE, COMMA_SEPARATED or SRCSET) and
  # returns the new value.  Separators and srcset descriptors are kept
  # as they are; a value with an unknown format is returned unchanged.
  def rewrite_attribute_value(value, format, &block)
    case format
    when SINGLE
      block.call(value.strip)
    when MULTIPLE
      value.gsub(/\S+/) { |uri| block.call(uri) }
    when COMMA_SEPARATED
      # A URI ends at the next comma as well as at whitespace, so that
      # lists written without spaces ("a.jar,b.jar") are split correctly.
      value.gsub(/((?:\A|,)\s*)([^\s,]+)/) { Regexp.last_match(1) + block.call(Regexp.last_match(2)) }
    when SRCSET
      # Only the first token of each candidate is a URI; the descriptor
      # that may follow it ("2x", "640w") is not preceded by a comma and
      # is therefore left alone.
      value.gsub(/((?:\A|,)\s*)(\S+)/) { Regexp.last_match(1) + block.call(Regexp.last_match(2)) }
    else
      value
    end
  end
end
|
||||
|
||||
# Rewrites every URI-valued attribute in +html+ by merging it onto
# +base_uri+, and returns the resulting markup.  Both the base and each
# URI found are passed through normalize_uri before being merged.
def self.rebase_hrefs(html, base_uri)
  base = normalize_uri(base_uri)

  HTMLTransformer.replace_uris(html) do |reference|
    base.merge(normalize_uri(reference)).to_s
  end
end
|
||||
end
|
||||
|
|
|
@ -323,4 +323,42 @@ describe LiquidInterpolatable::Filters do
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe 'rebase_hrefs' do
|
||||
let(:agent) { Agents::InterpolatableAgent.new(name: "test") }
|
||||
|
||||
let(:fragment) { <<HTML }
|
||||
<ul>
|
||||
<li>
|
||||
<a href="downloads/file1"><img src="/images/iconA.png" srcset="/images/iconA.png 1x, /images/iconA@2x.png 2x">file1</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="downloads/file2"><img src="/images/iconA.png" srcset="/images/iconA.png 1x, /images/iconA@2x.png 2x">file2</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="downloads/file3"><img src="/images/iconB.png" srcset="/images/iconB.png 1x, /images/iconB@2x.png 2x">file3</a>
|
||||
</li>
|
||||
</ul>
|
||||
HTML
|
||||
|
||||
let(:replaced_fragment) { <<HTML }
|
||||
<ul>
|
||||
<li>
|
||||
<a href="http://example.com/support/downloads/file1"><img src="http://example.com/images/iconA.png" srcset="http://example.com/images/iconA.png 1x, http://example.com/images/iconA@2x.png 2x">file1</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="http://example.com/support/downloads/file2"><img src="http://example.com/images/iconA.png" srcset="http://example.com/images/iconA.png 1x, http://example.com/images/iconA@2x.png 2x">file2</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="http://example.com/support/downloads/file3"><img src="http://example.com/images/iconB.png" srcset="http://example.com/images/iconB.png 1x, http://example.com/images/iconB@2x.png 2x">file3</a>
|
||||
</li>
|
||||
</ul>
|
||||
HTML
|
||||
|
||||
it 'rebases relative URLs in a fragment' do
|
||||
agent.interpolation_context['content'] = fragment
|
||||
agent.options['template'] = "{{ content | rebase_hrefs: 'http://example.com/support/files.html' }}"
|
||||
expect(agent.interpolated['template']).to eq(replaced_fragment)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -8,6 +8,10 @@ describe Agents::WebsiteAgent do
|
|||
headers: {
|
||||
'X-Status-Message' => 'OK'
|
||||
})
|
||||
stub_request(:any, /xkcd\.com\/index$/).to_return(status: 301,
|
||||
headers: {
|
||||
'Location' => 'http://xkcd.com/'
|
||||
})
|
||||
@valid_options = {
|
||||
'name' => "XKCD",
|
||||
'expected_update_period_in_days' => "2",
|
||||
|
@ -729,14 +733,36 @@ describe Agents::WebsiteAgent do
|
|||
end
|
||||
|
||||
it "should interpolate _response_" do
|
||||
@valid_options['url'] = 'http://xkcd.com/index'
|
||||
@valid_options['extract']['response_info'] =
|
||||
@valid_options['extract']['url'].merge(
|
||||
'value' => '"{{ "The reponse was " | append:_response_.status | append:" " | append:_response_.headers.X-Status-Message | append:"." }}"'
|
||||
'value' => '{{ "The reponse from " | append:_response_.url | append:" was " | append:_response_.status | append:" " | append:_response_.headers.X-Status-Message | append:"." | to_xpath }}'
|
||||
)
|
||||
@valid_options['extract']['original_url'] =
|
||||
@valid_options['extract']['url'].merge(
|
||||
'value' => '{{ _url_ | to_xpath }}'
|
||||
)
|
||||
@checker.options = @valid_options
|
||||
@checker.check
|
||||
event = Event.last
|
||||
expect(event.payload['response_info']).to eq('The reponse was 200 OK.')
|
||||
expect(event.payload['response_info']).to eq('The reponse from http://xkcd.com/ was 200 OK.')
|
||||
expect(event.payload['original_url']).to eq('http://xkcd.com/index')
|
||||
end
|
||||
|
||||
it "should be formatted by template after extraction" do
|
||||
@valid_options['template'] = {
|
||||
'url' => '{{url}}',
|
||||
'title' => '{{title | upcase}}',
|
||||
'summary' => '{{title}}: {{hovertext | truncate: 20}}',
|
||||
}
|
||||
@checker.options = @valid_options
|
||||
@checker.check
|
||||
event = Event.last
|
||||
expect(event.payload).to eq({
|
||||
'title' => 'EVOLVING',
|
||||
'url' => 'http://imgs.xkcd.com/comics/evolving.png',
|
||||
'summary' => 'Evolving: Biologists play r...',
|
||||
})
|
||||
end
|
||||
|
||||
describe "XML" do
|
||||
|
|
|
@ -130,7 +130,10 @@ shared_examples_for AgentControllerConcern do
|
|||
end
|
||||
|
||||
it "should configure targets with nested objects" do
|
||||
agent.control_targets << agents(:bob_data_output_agent)
|
||||
agent.control_targets = [
|
||||
agents(:bob_basecamp_agent), # does not support a `template` option, but anyway
|
||||
agents(:bob_data_output_agent)
|
||||
]
|
||||
agent.options['action'] = 'configure'
|
||||
agent.options['configure_options'] = {
|
||||
template: {
|
||||
|
|
Loading…
Add table
Reference in a new issue