diff --git a/app/models/agents/website_agent.rb b/app/models/agents/website_agent.rb index 5bd7672e..437df864 100644 --- a/app/models/agents/website_agent.rb +++ b/app/models/agents/website_agent.rb @@ -35,7 +35,9 @@ module Agents To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes. - Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor. E.g., if you're extracting rows, all extractors must match all rows. For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful. + Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor, except for extractors that have `repeat` set to true. E.g., if you're extracting rows, all extractors must match all rows. For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful. + + For extractors with `repeat` set to true, their first matches will be included in all extracts. This is useful, for example, when you want to include the title of a page in all events created from the page. # Scraping HTML and XML @@ -44,7 +46,8 @@ module Agents "extract": { "url": { "css": "#comic img", "value": "@src" }, "title": { "css": "#comic img", "value": "@title" }, - "body_text": { "css": "div.main", "value": "string(.)" } + "body_text": { "css": "div.main", "value": "string(.)" }, + "page_title": { "css": "title", "value": "string(.)", "repeat": true } } "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and `string(.)` gives a string with all the enclosed text nodes concatenated without entity escaping (such as `&`). To extract the innerHTML, use `./node()`; and to extract the outer HTML, use `.`. @@ -379,21 +382,24 @@ module Agents extract_xml(doc) end - if output.each_value.each_cons(2).any? 
{ |m, n| m.size != n.size } - raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}" - end - - num_tuples = output.each_value.first.size + num_tuples = output.each_value.inject(nil) { |num, value| + case size = value.size + when Float::INFINITY + num + when Integer + if num && num != size + raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}" + end + size + end + } or raise "At least one non-repeat key is required" old_events = previous_payloads num_tuples template = options['template'].presence - num_tuples.times do |index| - extracted = {} - interpolated['extract'].each_key do |name| - extracted[name] = output[name][index] - end + num_tuples.times.zip(*output.values) do |index, *values| + extracted = output.each_key.lazy.zip(values).to_h result = if template @@ -523,8 +529,14 @@ module Agents def extract_each(&block) interpolated['extract'].each_with_object({}) { |(name, extraction_details), output| - values = [] - block.call(extraction_details, values) + if boolify(extraction_details['repeat']) + values = Repeater.new { |repeater| + block.call(extraction_details, repeater) + } + else + values = [] + block.call(extraction_details, values) + end log "Values extracted: #{values}" output[name] = values } @@ -605,6 +617,31 @@ module Agents false end + class Repeater < Enumerator + # Repeater.new { |y| + # # ... + # y << value + # } #=> [value, ...] 
+ def initialize(&block) + @value = nil + super(Float::INFINITY) { |y| + loop { y << @value } + } + catch(@done = Object.new) { + block.call(self) + } + end + + def <<(value) + @value = value + throw @done + end + + def to_s + "[#{@value.inspect}, ...]" + end + end + # Wraps Faraday::Response class ResponseDrop < LiquidDroppable::Drop def headers diff --git a/spec/models/agents/website_agent_spec.rb b/spec/models/agents/website_agent_spec.rb index f61f71bc..9f25752d 100644 --- a/spec/models/agents/website_agent_spec.rb +++ b/spec/models/agents/website_agent_spec.rb @@ -786,6 +786,7 @@ describe Agents::WebsiteAgent do 'title' => { 'xpath' => '/feed/entry', 'value' => 'normalize-space(./title)' }, 'url' => { 'xpath' => '/feed/entry', 'value' => './link[1]/@href' }, 'thumbnail' => { 'xpath' => '/feed/entry', 'value' => './thumbnail/@url' }, + 'page_title': { 'xpath': '/feed/title', 'value': 'string(.)', 'repeat' => true } } }, keep_events_for: 2.days) @checker.user = users(:bob) @@ -796,7 +797,10 @@ describe Agents::WebsiteAgent do expect { @checker.check }.to change { Event.count }.by(20) - event = Event.last + events = Event.last(20) + expect(events.size).to eq(20) + expect(events.map { |event| event.payload['page_title'] }.uniq).to eq(['Recent Commits to huginn:master']) + event = events.last expect(event.payload['title']).to eq('Shift to dev group') expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af') expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30') @@ -942,6 +946,7 @@ describe Agents::WebsiteAgent do it "can handle arrays" do json = { 'response' => { + 'status' => 'ok', 'data' => [ {'title' => "first", 'version' => 2}, {'title' => "second", 'version' => 2.5} @@ -956,8 +961,9 @@ describe Agents::WebsiteAgent do 'url' => "http://json-site.com", 'mode' => 'on_change', 'extract' => { - :title => {'path' => "response.data[*].title"}, - :version => {'path' 
=> "response.data[*].version"} + 'title' => { 'path' => "response.data[*].title" }, + 'version' => { 'path' => "response.data[*].version" }, + 'status' => { 'path' => "response.status", 'repeat' => true }, } } checker = Agents::WebsiteAgent.new(:name => "Weather Site", :options => site) @@ -969,9 +975,11 @@ describe Agents::WebsiteAgent do }.to change { Event.count }.by(2) (event2, event1) = Event.last(2) + expect(event1.payload['status']).to eq('ok') expect(event1.payload['version']).to eq(2.5) expect(event1.payload['title']).to eq("second") + expect(event2.payload['status']).to eq('ok') expect(event2.payload['version']).to eq(2) expect(event2.payload['title']).to eq("first") end @@ -1007,6 +1015,7 @@ describe Agents::WebsiteAgent do describe "text parsing" do before do stub_request(:any, /text-site/).to_return(body: <<-EOF, status: 200) +VERSION 1 water: wet fire: hot EOF @@ -1017,6 +1026,7 @@ fire: hot 'url' => 'http://text-site.com', 'mode' => 'on_change', 'extract' => { + 'version' => { 'regexp' => '^VERSION (.+)$', index: 1, repeat: true }, 'word' => { 'regexp' => '^(.+?): (.+)$', index: 1 }, 'property' => { 'regexp' => '^(.+?): (.+)$', index: '2' }, } @@ -1027,7 +1037,7 @@ fire: hot end it "works with regexp with named capture" do - @checker.options = @checker.options.merge('extract' => { + @checker.options = @checker.options.deep_merge('extract' => { 'word' => { 'regexp' => '^(?.+?): (?.+)$', index: 'word' }, 'property' => { 'regexp' => '^(?.+?): (?.+)$', index: 'property' }, }) @@ -1037,8 +1047,10 @@ fire: hot }.to change { Event.count }.by(2) event1, event2 = Event.last(2) + expect(event1.payload['version']).to eq('1') expect(event1.payload['word']).to eq('water') expect(event1.payload['property']).to eq('wet') + expect(event2.payload['version']).to eq('1') expect(event2.payload['word']).to eq('fire') expect(event2.payload['property']).to eq('hot') end @@ -1049,8 +1061,10 @@ fire: hot }.to change { Event.count }.by(2) event1, event2 = Event.last(2) + 
expect(event1.payload['version']).to eq('1') expect(event1.payload['word']).to eq('water') expect(event1.payload['property']).to eq('wet') + expect(event2.payload['version']).to eq('1') expect(event2.payload['word']).to eq('fire') expect(event2.payload['property']).to eq('hot') end