Rss agent dynamic cleanup (#1733)

* Remove fixed limit of 500 IDs used to figure out which entries are new.
Instead each ID that is checked against the seen_ids gets moved to the top of the list.
IDs that are no longer used in the RSS feed will end up at the bottom of the seen_ids list and will be removed.

* remove uncommented code line

* fix undefined method `count' for nil:NilClass when the first fetch of an RSS feed fails or returns an empty feed.

changed spec to verify the deletion of old ids by keeping all current ones, not by fixed limit

* revert changes for dynamic cleanup.
Instead made the limit of stored ids configurable. (key: max_ids)
Default limit will remain 500.

* fixed error message.
replaced repeated function call with single call specifying amount.
using options['max_ids'] instead of interpolated['max_ids'] to retrieve setting.

* Rename to remembered_id_count and fix spec

* Update rss_agent.rb

fixed indentation
This commit is contained in:
Warloxx 2017-04-24 00:44:03 +02:00 committed by Andrew Cantino
parent 81324ebad4
commit e3f79bf84b
2 changed files with 38 additions and 2 deletions

View file

@ -31,6 +31,7 @@ module Agents
* `force_encoding` - Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid or wrong charset in the Content-Type header. Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1).
* `user_agent` - A custom User-Agent name (default: "Faraday v#{Faraday::VERSION}").
* `max_events_per_run` - Limit number of events created (items parsed) per run for feed.
* `remembered_id_count` - Number of IDs to keep track of and avoid re-emitting (default: 500).
# Ordering Events
@ -133,6 +134,10 @@ module Agents
errors.add(:base, "Please provide 'expected_update_period_in_days' to indicate how many days can pass without an update before this Agent is considered to not be working")
end
if options['remembered_id_count'].present? && options['remembered_id_count'].to_i < 1
errors.add(:base, "Please provide 'remembered_id_count' as a number bigger than 0 indicating how many IDs should be saved to distinguish between new and old IDs in RSS feeds. Delete option to use default (500).")
end
validate_web_request_options!
validate_events_order
end
@ -177,13 +182,17 @@ module Agents
log "Fetched #{urls.to_sentence} and created #{events.size} event(s)."
end
# Number of entry IDs this agent should remember.
#
# Reads the 'remembered_id_count' option and coerces it with to_i;
# a blank or missing option yields the default of 500.
def remembered_id_count
  configured = options['remembered_id_count'].presence
  configured ? configured.to_i : 500
end
# Records +entry_id+ as seen unless it is already known.
#
# memory['seen_ids'] is initialised to an empty array on first use. A new ID
# is inserted at the front of the list, after which the list is trimmed from
# the tail so that it holds at most remembered_id_count entries. Trimming only
# happens when a new ID is added.
#
# Returns true when +entry_id+ was not seen before, false otherwise.
def check_and_track(entry_id)
  memory['seen_ids'] ||= []
  return false if memory['seen_ids'].include?(entry_id)

  memory['seen_ids'].unshift entry_id
  # The cap comes solely from remembered_id_count; a fixed 500 cap here would
  # silently override any configured limit above 500.
  overflow = memory['seen_ids'].length - remembered_id_count
  memory['seen_ids'].pop(overflow) if overflow > 0
  true
end

View file

@ -176,11 +176,38 @@ describe Agents::RssAgent do
expect(agent.memory['seen_ids'][0]).to eq(newest_id)
end
# With no 'remembered_id_count' option set, the default cap of 500 applies:
# starting from 490 remembered IDs, a check must leave exactly 500 in memory.
# NOTE(review): relies on the fixture feed yielding at least 10 unseen entries — confirm.
it "should truncate the seen_ids in memory at 500 items per default" do
  agent.memory['seen_ids'] = ['x'] * 490
  agent.check
  expect(agent.memory['seen_ids'].length).to eq(500)
end
# A configured 'remembered_id_count' of 600 is used as the cap: starting from
# 590 remembered IDs, a check must leave exactly 600 in memory.
# NOTE(review): relies on the fixture feed yielding at least 10 unseen entries — confirm.
it "should truncate the seen_ids in memory at amount of items configured in options" do
  agent.options['remembered_id_count'] = "600"
  agent.memory['seen_ids'] = Array.new(590, 'x')
  agent.check
  expect(agent.memory['seen_ids'].size).to eq(600)
end
# Lowering the limit does not touch memory by itself; the list is only cut
# down to the new limit once a check has run.
# NOTE(review): presumably the fixture feed yields at least one unseen entry — confirm.
it "should truncate the seen_ids after configuring a lower limit of items when check is executed" do
  agent.memory['seen_ids'] = Array.new(600, 'x')
  agent.options['remembered_id_count'] = "400"
  # Changing the option alone leaves the stored IDs untouched.
  expect(agent.memory['seen_ids'].size).to eq(600)
  agent.check
  expect(agent.memory['seen_ids'].size).to eq(400)
end
# First verifies the custom limit of 600, then removes the option and verifies
# that the default cap of 500 applies again on the next check.
# NOTE(review): relies on the fixture feed yielding at least 10 unseen entries — confirm.
it "should truncate the seen_ids at default after removing custom limit" do
  # Phase 1: custom limit in effect.
  agent.options['remembered_id_count'] = "600"
  agent.memory['seen_ids'] = Array.new(590, 'x')
  agent.check
  expect(agent.memory['seen_ids'].size).to eq(600)

  # Phase 2: option removed, default limit applies.
  agent.options.delete('remembered_id_count')
  agent.memory['seen_ids'] = Array.new(590, 'x')
  agent.check
  expect(agent.memory['seen_ids'].size).to eq(500)
end
it "should support an array of URLs" do
agent.options['url'] = ["https://github.com/cantino/huginn/commits/master.atom", "http://feeds.feedburner.com/SlickdealsnetFP?format=atom"]