mirror of
https://github.com/Fishwaldo/huginn.git
synced 2025-03-15 19:31:26 +00:00
PhantomJs Cloud Agent (#1503)
* Initial draft of PhantomJsCloudAgent Generates event with url for fetching html/plainText content * Add options * Pass in event instead of url Fix hash syntax Remove whitespace Add mode merge * Add some tests * Style changes - Add link to wiki entry for manually creating agent with full set of options
This commit is contained in:
parent
2a524abff5
commit
4e2d1775a6
2 changed files with 279 additions and 0 deletions
162
app/models/agents/phantom_js_cloud_agent.rb
Normal file
162
app/models/agents/phantom_js_cloud_agent.rb
Normal file
|
@ -0,0 +1,162 @@
|
|||
require 'json'
|
||||
require 'uri'
|
||||
|
||||
module Agents
|
||||
class PhantomJsCloudAgent < Agent
|
||||
include ERB::Util
|
||||
include FormConfigurable
|
||||
include WebRequestConcern
|
||||
|
||||
can_dry_run!
|
||||
|
||||
default_schedule 'every_12h'
|
||||
|
||||
description <<-MD
|
||||
[PhantomJs Cloud](https://phantomjscloud.com/) renders webpages in much the same way as a browser would, and allows the Website Agent to properly scrape dynamic content from javascript-heavy pages.
|
||||
|
||||
The Phantom Js Cloud Agent is used to formulate a url in accordance with the [PhantomJs Cloud API](https://phantomjscloud.com/docs/index.html).
|
||||
This url can then be supplied to Website Agent to fetch and parse content.
|
||||
|
||||
[Sign up](https://dashboard.phantomjscloud.com/dash.html#/signup) to get an api key, and add it in Huginn credentials.
|
||||
|
||||
|
||||
Options:
|
||||
|
||||
* `Api key` - PhantomJs Cloud API Key credential stored in Huginn
|
||||
* `Url` - The url to render
|
||||
* `Mode` - Create a new `clean` event or `merge` old payload with new values (default: `clean`)
|
||||
* `Render type` - Render as html or plain text without html tags (default: `html`)
|
||||
* `Output as json` - Return the page conents and metadata as a JSON object (default: `false`)
|
||||
* `Ignore images` - Skip loading of inlined images (default: `false`)
|
||||
* `Url agent` - A custom User-Agent name (default: `#{default_user_agent}`)
|
||||
* `Wait interval` - Milliseconds to delay rendering after the last resource is finished loading.
|
||||
This is useful in case there are any AJAX requests or animations that need to finish up.
|
||||
This can safely be set to 0 if you know there are no AJAX or animations you need to wait for (default: `1000`ms)
|
||||
|
||||
|
||||
As this agent only provides a limited subset of the most commonly used options, you can follow [this guide](https://github.com/cantino/huginn/wiki/Browser-Emulation-Using-PhantomJS-Cloud) to make full use of additional options PhantomJsCloud provides.
|
||||
|
||||
MD
|
||||
|
||||
event_description <<-MD
|
||||
Events look like this:
|
||||
{
|
||||
"url": "..."
|
||||
}
|
||||
MD
|
||||
|
||||
def default_options
|
||||
{
|
||||
'mode' => 'clean',
|
||||
'url' => 'http://xkcd.com',
|
||||
'render_type' => 'html',
|
||||
'output_as_json' => false,
|
||||
'ignore_images' => false,
|
||||
'user_agent' => self.class.default_user_agent,
|
||||
'wait_interval' => '1000'
|
||||
}
|
||||
end
|
||||
|
||||
form_configurable :mode, type: :array, values: ['clean', 'merge']
|
||||
form_configurable :api_key, roles: :completable
|
||||
form_configurable :url
|
||||
form_configurable :render_type, type: :array, values: ['html', 'plainText']
|
||||
form_configurable :output_as_json, type: :boolean
|
||||
form_configurable :ignore_images, type: :boolean
|
||||
form_configurable :user_agent, type: :text
|
||||
form_configurable :wait_interval
|
||||
|
||||
def mode
|
||||
interpolated['mode'].presence || default_options['mode']
|
||||
end
|
||||
|
||||
def render_type
|
||||
interpolated['render_type'].presence || default_options['render_type']
|
||||
end
|
||||
|
||||
def output_as_json
|
||||
boolify(interpolated['output_as_json'].presence ||
|
||||
default_options['output_as_json'])
|
||||
end
|
||||
|
||||
def ignore_images
|
||||
boolify(interpolated['ignore_images'].presence ||
|
||||
default_options['ignore_images'])
|
||||
end
|
||||
|
||||
def user_agent
|
||||
interpolated['user_agent'].presence || self.class.default_user_agent
|
||||
end
|
||||
|
||||
def wait_interval
|
||||
interpolated['wait_interval'].presence || default_options['wait_interval']
|
||||
end
|
||||
|
||||
def page_request_settings
|
||||
prs = {}
|
||||
|
||||
prs[:ignoreImages] = ignore_images if ignore_images
|
||||
prs[:userAgent] = user_agent if user_agent.present?
|
||||
|
||||
if wait_interval != default_options['wait_interval']
|
||||
prs[:wait_interval] = wait_interval
|
||||
end
|
||||
|
||||
prs
|
||||
end
|
||||
|
||||
def build_phantom_url(interpolated)
|
||||
api_key = interpolated[:api_key]
|
||||
page_request_hash = {
|
||||
url: interpolated[:url],
|
||||
renderType: render_type
|
||||
}
|
||||
|
||||
page_request_hash[:outputAsJson] = output_as_json if output_as_json
|
||||
|
||||
page_request_settings_hash = page_request_settings
|
||||
|
||||
if page_request_settings_hash.any?
|
||||
page_request_hash[:requestSettings] = page_request_settings_hash
|
||||
end
|
||||
|
||||
request = page_request_hash.to_json
|
||||
log "Generated request: #{request}"
|
||||
|
||||
encoded = url_encode(request)
|
||||
"https://phantomjscloud.com/api/browser/v2/#{api_key}/?request=#{encoded}"
|
||||
end
|
||||
|
||||
def check
|
||||
phantom_url = build_phantom_url(interpolated)
|
||||
|
||||
create_event payload: { 'url' => phantom_url }
|
||||
end
|
||||
|
||||
def receive(incoming_events)
|
||||
incoming_events.each do |event|
|
||||
interpolate_with(event) do
|
||||
existing_payload = interpolated['mode'].to_s == 'merge' ? event.payload : {}
|
||||
phantom_url = build_phantom_url(interpolated)
|
||||
|
||||
result = { 'url' => phantom_url }
|
||||
create_event payload: existing_payload.merge(result)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def complete_api_key
|
||||
user.user_credentials.map { |c| { text: c.credential_name, id: "{% credential #{c.credential_name} %}" } }
|
||||
end
|
||||
|
||||
def working?
|
||||
!recent_error_logs? || received_event_without_error?
|
||||
end
|
||||
|
||||
def validate_options
|
||||
# Check for required fields
|
||||
errors.add(:base, 'Url is required') unless options['url'].present?
|
||||
errors.add(:base, 'API key (credential) is required') unless options['api_key'].present?
|
||||
end
|
||||
end
|
||||
end
|
117
spec/models/agents/phantom_js_cloud_agent_spec.rb
Normal file
117
spec/models/agents/phantom_js_cloud_agent_spec.rb
Normal file
|
@ -0,0 +1,117 @@
|
|||
require 'rails_helper'
|
||||
|
||||
describe Agents::PhantomJsCloudAgent do
|
||||
before do
|
||||
|
||||
@valid_options = {
|
||||
'name' => "XKCD",
|
||||
'render_type' => "html",
|
||||
'url' => "http://xkcd.com",
|
||||
'mode' => 'clean',
|
||||
'api_key' => '1234567890'
|
||||
}
|
||||
|
||||
@checker = Agents::PhantomJsCloudAgent.new(:name => "xkcd", :options => @valid_options, :keep_events_for => 2.days)
|
||||
@checker.user = users(:jane)
|
||||
@checker.save!
|
||||
end
|
||||
|
||||
describe "validations" do
|
||||
before do
|
||||
expect(@checker).to be_valid
|
||||
end
|
||||
|
||||
it "should validate the presence of url" do
|
||||
@checker.options['url'] = "http://google.com"
|
||||
expect(@checker).to be_valid
|
||||
|
||||
@checker.options['url'] = ""
|
||||
expect(@checker).not_to be_valid
|
||||
|
||||
@checker.options['url'] = nil
|
||||
expect(@checker).not_to be_valid
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
describe "emitting event" do
|
||||
it "should emit url as event" do
|
||||
expect {
|
||||
@checker.check
|
||||
}.to change { @checker.events.count }.by(1)
|
||||
|
||||
item,* = @checker.events.last(1)
|
||||
expect(item.payload['url']).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%7D%7D")
|
||||
end
|
||||
|
||||
it "should set render type as plain text" do
|
||||
@checker.options['render_type'] = 'plainText'
|
||||
|
||||
expect {
|
||||
@checker.check
|
||||
}.to change { @checker.events.count }.by(1)
|
||||
|
||||
item,* = @checker.events.last(1)
|
||||
expect(item.payload['url']).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22plainText%22%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%7D%7D")
|
||||
end
|
||||
|
||||
it "should set output as json" do
|
||||
@checker.options['output_as_json'] = true
|
||||
|
||||
expect {
|
||||
@checker.check
|
||||
}.to change { @checker.events.count }.by(1)
|
||||
|
||||
item,* = @checker.events.last(1)
|
||||
expect(item.payload['url']).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22outputAsJson%22%3Atrue%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%7D%7D")
|
||||
end
|
||||
|
||||
it "should not set ignore images" do
|
||||
@checker.options['ignore_images'] = false
|
||||
|
||||
expect {
|
||||
@checker.check
|
||||
}.to change { @checker.events.count }.by(1)
|
||||
|
||||
item,* = @checker.events.last(1)
|
||||
expect(item.payload['url']).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%7D%7D")
|
||||
end
|
||||
|
||||
it "should set ignore images" do
|
||||
@checker.options['ignore_images'] = true
|
||||
|
||||
expect {
|
||||
@checker.check
|
||||
}.to change { @checker.events.count }.by(1)
|
||||
|
||||
item,* = @checker.events.last(1)
|
||||
expect(item.payload['url']).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22requestSettings%22%3A%7B%22ignoreImages%22%3Atrue%2C%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%7D%7D")
|
||||
end
|
||||
|
||||
it "should set wait interval to zero" do
|
||||
@checker.options['wait_interval'] = '0'
|
||||
|
||||
expect {
|
||||
@checker.check
|
||||
}.to change { @checker.events.count }.by(1)
|
||||
|
||||
item,* = @checker.events.last(1)
|
||||
expect(item.payload['url']).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%2C%22wait_interval%22%3A%220%22%7D%7D")
|
||||
end
|
||||
|
||||
it "should set user agent to BlackBerry" do
|
||||
@checker.options['user_agent'] = 'Mozilla/5.0 (BlackBerry; U; BlackBerry 9900; en) AppleWebKit/534.11+ (KHTML, like Gecko) Version/7.1.0.346 Mobile Safari/534.11+'
|
||||
|
||||
expect {
|
||||
@checker.check
|
||||
}.to change { @checker.events.count }.by(1)
|
||||
|
||||
item,* = @checker.events.last(1)
|
||||
expect(item.payload['url']).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Mozilla%2F5.0%20%28BlackBerry%3B%20U%3B%20BlackBerry%209900%3B%20en%29%20AppleWebKit%2F534.11%2B%20%28KHTML%2C%20like%20Gecko%29%20Version%2F7.1.0.346%20Mobile%20Safari%2F534.11%2B%22%7D%7D")
|
||||
end
|
||||
|
||||
|
||||
|
||||
end
|
||||
|
||||
end
|
Loading…
Add table
Reference in a new issue