From e3f79bf84bb6d2593aef563133edcaa32ae47d38 Mon Sep 17 00:00:00 2001 From: Warloxx Date: Mon, 24 Apr 2017 00:44:03 +0200 Subject: [PATCH] Rss agent dynamic cleanup (#1733) * Remove fixed limit of 500 IDs used to figure out which entries are new. Instead each ID that is checked against the seen_ids gets moved to the top of the list. IDs that are no longer used in the RSS Feed will end up at the bottom of the seen_ids list and will be removed. * remove uncommented code line * fix undefined method `count' for nil:NilClass, in case the first fetch of an rss feed failed or fetched empty rss feed. changed spec to verify the deletion of old ids by keeping all current ones, not by fixed limit * revert changes for dynamic cleanup. Instead made the limit of stored ids configurable. (key: max_ids) Default limit will remain 500. * fixed error message. replaced repeated function call with single call specifying amount. using options['max_ids'] instead of interpolated['max_ids'] to retrieve setting. * Rename to remembered_id_count and fix spec * Update rss_agent.rb fixed indentation --- app/models/agents/rss_agent.rb | 11 ++++++++++- spec/models/agents/rss_agent_spec.rb | 29 +++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/app/models/agents/rss_agent.rb b/app/models/agents/rss_agent.rb index 1741b726..8d779b6c 100644 --- a/app/models/agents/rss_agent.rb +++ b/app/models/agents/rss_agent.rb @@ -31,6 +31,7 @@ module Agents * `force_encoding` - Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid or wrong charset in the Content-Type header. Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1). * `user_agent` - A custom User-Agent name (default: "Faraday v#{Faraday::VERSION}"). * `max_events_per_run` - Limit number of events created (items parsed) per run for feed. 
+ * `remembered_id_count` - Number of IDs to keep track of and avoid re-emitting (default: 500). # Ordering Events @@ -133,6 +134,10 @@ module Agents errors.add(:base, "Please provide 'expected_update_period_in_days' to indicate how many days can pass without an update before this Agent is considered to not be working") end + if options['remembered_id_count'].present? && options['remembered_id_count'].to_i < 1 + errors.add(:base, "Please provide 'remembered_id_count' as a number bigger than 0 indicating how many IDs should be saved to distinguish between new and old IDs in RSS feeds. Delete option to use default (500).") + end + validate_web_request_options! validate_events_order end @@ -177,13 +182,17 @@ module Agents log "Fetched #{urls.to_sentence} and created #{events.size} event(s)." end + def remembered_id_count + (options['remembered_id_count'].presence || 500).to_i + end + def check_and_track(entry_id) memory['seen_ids'] ||= [] if memory['seen_ids'].include?(entry_id) false else memory['seen_ids'].unshift entry_id - memory['seen_ids'].pop if memory['seen_ids'].length > 500 + memory['seen_ids'].pop(memory['seen_ids'].length - remembered_id_count) if memory['seen_ids'].length > remembered_id_count true end end diff --git a/spec/models/agents/rss_agent_spec.rb b/spec/models/agents/rss_agent_spec.rb index bf312006..5a375401 100644 --- a/spec/models/agents/rss_agent_spec.rb +++ b/spec/models/agents/rss_agent_spec.rb @@ -176,11 +176,38 @@ describe Agents::RssAgent do expect(agent.memory['seen_ids'][0]).to eq(newest_id) end - it "should truncate the seen_ids in memory at 500 items" do + it "should truncate the seen_ids in memory at 500 items per default" do agent.memory['seen_ids'] = ['x'] * 490 agent.check expect(agent.memory['seen_ids'].length).to eq(500) end + + it "should truncate the seen_ids in memory at amount of items configured in options" do + agent.options['remembered_id_count'] = "600" + agent.memory['seen_ids'] = ['x'] * 590 + agent.check + 
expect(agent.memory['seen_ids'].length).to eq(600) + end + + it "should truncate the seen_ids after configuring a lower limit of items when check is executed" do + agent.memory['seen_ids'] = ['x'] * 600 + agent.options['remembered_id_count'] = "400" + expect(agent.memory['seen_ids'].length).to eq(600) + agent.check + expect(agent.memory['seen_ids'].length).to eq(400) + end + + it "should truncate the seen_ids at default after removing custom limit" do + agent.options['remembered_id_count'] = "600" + agent.memory['seen_ids'] = ['x'] * 590 + agent.check + expect(agent.memory['seen_ids'].length).to eq(600) + + agent.options.delete('remembered_id_count') + agent.memory['seen_ids'] = ['x'] * 590 + agent.check + expect(agent.memory['seen_ids'].length).to eq(500) + end it "should support an array of URLs" do agent.options['url'] = ["https://github.com/cantino/huginn/commits/master.atom", "http://feeds.feedburner.com/SlickdealsnetFP?format=atom"]