Merge pull request #1769 from cantino/website_agent_repeat_option

Add a `repeat` option for extractors to WebsiteAgent
This commit is contained in:
Akinori MUSHA 2016-11-20 19:16:36 +09:00 committed by GitHub
commit c575af959b
2 changed files with 83 additions and 31 deletions

View file

@ -35,7 +35,9 @@ module Agents
To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor. E.g., if you're extracting rows, all extractors must match all rows. For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful.
Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor except when it has `repeat` set to true. E.g., if you're extracting rows, all extractors must match all rows. For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful.
For extractors with `repeat` set to true, their first matches will be included in all extracts. This is useful, for example, when you want to include the title of a page in all events created from the page.
# Scraping HTML and XML
@ -44,7 +46,8 @@ module Agents
"extract": {
"url": { "css": "#comic img", "value": "@src" },
"title": { "css": "#comic img", "value": "@title" },
"body_text": { "css": "div.main", "value": "string(.)" }
"body_text": { "css": "div.main", "value": "string(.)" },
"page_title": { "css": "title", "value": "string(.)", "repeat": true }
}
"@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and `string(.)` gives a string with all the enclosed text nodes concatenated without entity escaping (such as `&`). To extract the innerHTML, use `./node()`; and to extract the outer HTML, use `.`.
@ -379,21 +382,24 @@ module Agents
extract_xml(doc)
end
if output.each_value.each_cons(2).any? { |m, n| m.size != n.size }
raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
end
num_tuples = output.each_value.first.size
num_tuples = output.each_value.inject(nil) { |num, value|
case size = value.size
when Float::INFINITY
num
when Integer
if num && num != size
raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
end
size
end
} or raise "At least one non-repeat key is required"
old_events = previous_payloads num_tuples
template = options['template'].presence
num_tuples.times do |index|
extracted = {}
interpolated['extract'].each_key do |name|
extracted[name] = output[name][index]
end
num_tuples.times.zip(*output.values) do |index, *values|
extracted = output.each_key.lazy.zip(values).to_h
result =
if template
@ -523,36 +529,44 @@ module Agents
def extract_each(&block)
interpolated['extract'].each_with_object({}) { |(name, extraction_details), output|
output[name] = block.call(extraction_details)
if boolify(extraction_details['repeat'])
values = Repeater.new { |repeater|
block.call(extraction_details, repeater)
}
else
values = []
block.call(extraction_details, values)
end
log "Values extracted: #{values}"
output[name] = values
}
end
def extract_json(doc)
extract_each { |extraction_details|
result = Utils.values_at(doc, extraction_details['path'])
log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
result
extract_each { |extraction_details, values|
log "Extracting #{extraction_type} at #{extraction_details['path']}"
Utils.values_at(doc, extraction_details['path']).each { |value|
values << value
}
}
end
def extract_text(doc)
extract_each { |extraction_details|
extract_each { |extraction_details, values|
regexp = Regexp.new(extraction_details['regexp'])
log "Extracting #{extraction_type} with #{regexp}"
case index = extraction_details['index']
when /\A\d+\z/
index = index.to_i
end
result = []
doc.scan(regexp) {
result << Regexp.last_match[index]
values << Regexp.last_match[index]
}
log "Extracting #{extraction_type} at #{regexp}: #{result}"
result
}
end
def extract_xml(doc)
extract_each { |extraction_details|
extract_each { |extraction_details, values|
case
when css = extraction_details['css']
nodes = doc.css(css)
@ -561,22 +575,21 @@ module Agents
else
raise '"css" or "xpath" is required for HTML or XML extraction'
end
log "Extracting #{extraction_type} at #{xpath || css}"
case nodes
when Nokogiri::XML::NodeSet
result = nodes.map { |node|
nodes.each { |node|
case value = node.xpath(extraction_details['value'] || '.')
when Float
# Node#xpath() returns any numeric value as float;
# convert it to integer as appropriate.
value = value.to_i if value.to_i == value
end
value.to_s
values << value.to_s
}
else
raise "The result of HTML/XML extraction was not a NodeSet"
end
log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
result
}
end
@ -604,6 +617,31 @@ module Agents
false
end
# An endless Enumerator that repeats a single captured value.
#
#   Repeater.new { |y|
#     y << value   # only the first pushed value is kept
#   } #=> enumerates [value, value, ...] with size Float::INFINITY
#
# The infinite size is what lets callers distinguish a `repeat`
# extractor from a normal, finite one.
class Repeater < Enumerator
  def initialize(&block)
    @value = nil
    # Enumerating this object re-yields the captured value forever.
    super(Float::INFINITY) do |yielder|
      loop { yielder << @value }
    end
    # Run the block only until the first #<< call; #<< throws this
    # unique sentinel to unwind immediately back here.
    @done = Object.new
    catch(@done) do
      block.call(self)
    end
  end

  # Captures the first value pushed by the construction block and
  # aborts any further work in that block.
  def <<(value)
    @value = value
    throw @done
  end

  def to_s
    "[#{@value.inspect}, ...]"
  end
end
# Wraps Faraday::Response
class ResponseDrop < LiquidDroppable::Drop
def headers

View file

@ -786,6 +786,7 @@ describe Agents::WebsiteAgent do
'title' => { 'xpath' => '/feed/entry', 'value' => 'normalize-space(./title)' },
'url' => { 'xpath' => '/feed/entry', 'value' => './link[1]/@href' },
'thumbnail' => { 'xpath' => '/feed/entry', 'value' => './thumbnail/@url' },
'page_title': { 'xpath': '/feed/title', 'value': 'string(.)', 'repeat' => true }
}
}, keep_events_for: 2.days)
@checker.user = users(:bob)
@ -796,7 +797,10 @@ describe Agents::WebsiteAgent do
expect {
@checker.check
}.to change { Event.count }.by(20)
event = Event.last
events = Event.last(20)
expect(events.size).to eq(20)
expect(events.map { |event| event.payload['page_title'] }.uniq).to eq(['Recent Commits to huginn:master'])
event = events.last
expect(event.payload['title']).to eq('Shift to dev group')
expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
@ -942,6 +946,7 @@ describe Agents::WebsiteAgent do
it "can handle arrays" do
json = {
'response' => {
'status' => 'ok',
'data' => [
{'title' => "first", 'version' => 2},
{'title' => "second", 'version' => 2.5}
@ -956,8 +961,9 @@ describe Agents::WebsiteAgent do
'url' => "http://json-site.com",
'mode' => 'on_change',
'extract' => {
:title => {'path' => "response.data[*].title"},
:version => {'path' => "response.data[*].version"}
'title' => { 'path' => "response.data[*].title" },
'version' => { 'path' => "response.data[*].version" },
'status' => { 'path' => "response.status", 'repeat' => true },
}
}
checker = Agents::WebsiteAgent.new(:name => "Weather Site", :options => site)
@ -969,9 +975,11 @@ describe Agents::WebsiteAgent do
}.to change { Event.count }.by(2)
(event2, event1) = Event.last(2)
expect(event1.payload['status']).to eq('ok')
expect(event1.payload['version']).to eq(2.5)
expect(event1.payload['title']).to eq("second")
expect(event2.payload['status']).to eq('ok')
expect(event2.payload['version']).to eq(2)
expect(event2.payload['title']).to eq("first")
end
@ -1007,6 +1015,7 @@ describe Agents::WebsiteAgent do
describe "text parsing" do
before do
stub_request(:any, /text-site/).to_return(body: <<-EOF, status: 200)
VERSION 1
water: wet
fire: hot
EOF
@ -1017,6 +1026,7 @@ fire: hot
'url' => 'http://text-site.com',
'mode' => 'on_change',
'extract' => {
'version' => { 'regexp' => '^VERSION (.+)$', index: 1, repeat: true },
'word' => { 'regexp' => '^(.+?): (.+)$', index: 1 },
'property' => { 'regexp' => '^(.+?): (.+)$', index: '2' },
}
@ -1027,7 +1037,7 @@ fire: hot
end
it "works with regexp with named capture" do
@checker.options = @checker.options.merge('extract' => {
@checker.options = @checker.options.deep_merge('extract' => {
'word' => { 'regexp' => '^(?<word>.+?): (?<property>.+)$', index: 'word' },
'property' => { 'regexp' => '^(?<word>.+?): (?<property>.+)$', index: 'property' },
})
@ -1037,8 +1047,10 @@ fire: hot
}.to change { Event.count }.by(2)
event1, event2 = Event.last(2)
expect(event1.payload['version']).to eq('1')
expect(event1.payload['word']).to eq('water')
expect(event1.payload['property']).to eq('wet')
expect(event2.payload['version']).to eq('1')
expect(event2.payload['word']).to eq('fire')
expect(event2.payload['property']).to eq('hot')
end
@ -1049,8 +1061,10 @@ fire: hot
}.to change { Event.count }.by(2)
event1, event2 = Event.last(2)
expect(event1.payload['version']).to eq('1')
expect(event1.payload['word']).to eq('water')
expect(event1.payload['property']).to eq('wet')
expect(event2.payload['version']).to eq('1')
expect(event2.payload['word']).to eq('fire')
expect(event2.payload['property']).to eq('hot')
end