PhantomJs Cloud Agent (#1503)

* Initial draft of PhantomJsCloudAgent

Generates event with url for fetching html/plainText content

* Add options

* Pass in event instead of url
Fix hash syntax
Remove whitespace
Add mode merge

* Add some tests

* Style changes

- Add link to wiki entry for manually creating agent with full set of
options
This commit is contained in:
Irfan Charania 2016-12-20 09:44:54 -08:00 committed by Andrew Cantino
parent 2a524abff5
commit 4e2d1775a6
2 changed files with 279 additions and 0 deletions

View file

@ -0,0 +1,162 @@
require 'json'
require 'uri'
module Agents
  # Formulates PhantomJs Cloud API request urls and emits them as events.
  #
  # The agent performs no HTTP request itself: it serializes the configured
  # page request to JSON, url-encodes it, and appends it to the PhantomJs Cloud
  # endpoint together with the API key. The resulting url is emitted under the
  # `url` payload key so that a downstream agent can fetch it.
  class PhantomJsCloudAgent < Agent
    include ERB::Util
    include FormConfigurable
    include WebRequestConcern

    can_dry_run!

    default_schedule 'every_12h'

    description <<-MD
      [PhantomJs Cloud](https://phantomjscloud.com/) renders webpages in much the same way as a browser would, and allows the Website Agent to properly scrape dynamic content from javascript-heavy pages.

      The Phantom Js Cloud Agent is used to formulate a url in accordance with the [PhantomJs Cloud API](https://phantomjscloud.com/docs/index.html).
      This url can then be supplied to Website Agent to fetch and parse content.

      [Sign up](https://dashboard.phantomjscloud.com/dash.html#/signup) to get an api key, and add it in Huginn credentials.

      Options:

      * `Api key` - PhantomJs Cloud API Key credential stored in Huginn
      * `Url` - The url to render
      * `Mode` - Create a new `clean` event or `merge` old payload with new values (default: `clean`)
      * `Render type` - Render as html or plain text without html tags (default: `html`)
      * `Output as json` - Return the page contents and metadata as a JSON object (default: `false`)
      * `Ignore images` - Skip loading of inlined images (default: `false`)
      * `User agent` - A custom User-Agent name (default: `#{default_user_agent}`)
      * `Wait interval` - Milliseconds to delay rendering after the last resource is finished loading.
        This is useful in case there are any AJAX requests or animations that need to finish up.
        This can safely be set to 0 if you know there are no AJAX or animations you need to wait for (default: `1000`ms)

      As this agent only provides a limited subset of the most commonly used options, you can follow [this guide](https://github.com/cantino/huginn/wiki/Browser-Emulation-Using-PhantomJS-Cloud) to make full use of additional options PhantomJsCloud provides.
    MD

    event_description <<-MD
      Events look like this:

          {
            "url": "..."
          }
    MD

    # Option values used for a new agent; the option readers below also fall
    # back to these when an option is blank.
    def default_options
      {
        'mode' => 'clean',
        'url' => 'http://xkcd.com',
        'render_type' => 'html',
        'output_as_json' => false,
        'ignore_images' => false,
        'user_agent' => self.class.default_user_agent,
        'wait_interval' => '1000'
      }
    end

    form_configurable :mode, type: :array, values: ['clean', 'merge']
    form_configurable :api_key, roles: :completable
    form_configurable :url
    form_configurable :render_type, type: :array, values: ['html', 'plainText']
    form_configurable :output_as_json, type: :boolean
    form_configurable :ignore_images, type: :boolean
    form_configurable :user_agent, type: :text
    form_configurable :wait_interval

    # Interpolated `mode` option, or the default when blank.
    def mode
      interpolated['mode'].presence || default_options['mode']
    end

    # Interpolated `render_type` option, or the default when blank.
    def render_type
      interpolated['render_type'].presence || default_options['render_type']
    end

    # `output_as_json` option passed through `boolify`; a blank value (which
    # includes `false`) falls back to the default.
    def output_as_json
      boolify(interpolated['output_as_json'].presence ||
        default_options['output_as_json'])
    end

    # `ignore_images` option passed through `boolify`; a blank value (which
    # includes `false`) falls back to the default.
    def ignore_images
      boolify(interpolated['ignore_images'].presence ||
        default_options['ignore_images'])
    end

    # Interpolated `user_agent` option, or the class-level default user agent
    # when blank.
    def user_agent
      interpolated['user_agent'].presence || self.class.default_user_agent
    end

    # Interpolated `wait_interval` option, or the default when blank. The value
    # is returned as configured and is not converted to a number.
    def wait_interval
      interpolated['wait_interval'].presence || default_options['wait_interval']
    end

    # Builds the `requestSettings` part of the page request.
    #
    # Only settings that are enabled or differ from their default are included,
    # so the returned hash may be empty. Key insertion order determines the
    # order in the serialized JSON.
    def page_request_settings
      prs = {}
      prs[:ignoreImages] = ignore_images if ignore_images
      prs[:userAgent] = user_agent if user_agent.present?

      # NOTE(review): the sibling keys are camelCase and the PhantomJs Cloud
      # API appears to name this setting `waitInterval`, so this snake_case key
      # may be ignored by the service. Confirm against the API docs before
      # renaming: the existing spec pins the generated url with this exact key.
      prs[:wait_interval] = wait_interval if wait_interval != default_options['wait_interval']

      prs
    end

    # Builds the PhantomJs Cloud url for the given interpolated options.
    #
    # The `api_key` and `url` entries are read from the argument; every other
    # setting is read through the option readers above. The JSON request is
    # logged before being url-encoded into the `request` query parameter.
    #
    # Returns the url as a String.
    def build_phantom_url(interpolated)
      api_key = interpolated[:api_key]

      page_request_hash = {
        url: interpolated[:url],
        renderType: render_type
      }
      page_request_hash[:outputAsJson] = output_as_json if output_as_json

      page_request_settings_hash = page_request_settings
      if page_request_settings_hash.any?
        page_request_hash[:requestSettings] = page_request_settings_hash
      end

      request = page_request_hash.to_json
      log "Generated request: #{request}"

      encoded = url_encode(request)
      "https://phantomjscloud.com/api/browser/v2/#{api_key}/?request=#{encoded}"
    end

    # Emits one event holding the url built from the agent's own options.
    def check
      phantom_url = build_phantom_url(interpolated)

      create_event payload: { 'url' => phantom_url }
    end

    # Emits one event per incoming event, with options interpolated against
    # that event. In `merge` mode the incoming payload is kept and its `url`
    # key is overwritten; otherwise the new payload holds only `url`.
    def receive(incoming_events)
      incoming_events.each do |event|
        interpolate_with(event) do
          existing_payload = mode == 'merge' ? event.payload : {}
          phantom_url = build_phantom_url(interpolated)

          result = { 'url' => phantom_url }
          create_event payload: existing_payload.merge(result)
        end
      end
    end

    # Completion entries for the `api_key` field: one per credential of the
    # owning user, each inserting a Liquid `credential` tag.
    def complete_api_key
      user.user_credentials.map { |c| { text: c.credential_name, id: "{% credential #{c.credential_name} %}" } }
    end

    def working?
      !recent_error_logs? || received_event_without_error?
    end

    def validate_options
      # Check for required fields
      errors.add(:base, 'Url is required') unless options['url'].present?
      errors.add(:base, 'API key (credential) is required') unless options['api_key'].present?
    end
  end
end

View file

@ -0,0 +1,117 @@
require 'rails_helper'
describe Agents::PhantomJsCloudAgent do
  before do
    @valid_options = {
      'name' => "XKCD",
      'render_type' => "html",
      'url' => "http://xkcd.com",
      'mode' => 'clean',
      'api_key' => '1234567890'
    }

    @checker = Agents::PhantomJsCloudAgent.new(:name => "xkcd", :options => @valid_options, :keep_events_for => 2.days)
    @checker.user = users(:jane)
    @checker.save!
  end

  describe "validations" do
    before do
      expect(@checker).to be_valid
    end

    it "should validate the presence of url" do
      @checker.options['url'] = "http://google.com"
      expect(@checker).to be_valid

      @checker.options['url'] = ""
      expect(@checker).not_to be_valid

      @checker.options['url'] = nil
      expect(@checker).not_to be_valid
    end

    # The agent also requires an api key; cover that validation as well.
    it "should validate the presence of api_key" do
      @checker.options['api_key'] = "0987654321"
      expect(@checker).to be_valid

      @checker.options['api_key'] = ""
      expect(@checker).not_to be_valid

      @checker.options['api_key'] = nil
      expect(@checker).not_to be_valid
    end
  end

  describe "emitting event" do
    # Runs a check, asserts that exactly one event was created, and returns
    # the url carried by that event's payload.
    def emitted_url
      expect {
        @checker.check
      }.to change { @checker.events.count }.by(1)

      @checker.events.last.payload['url']
    end

    it "should emit url as event" do
      expect(emitted_url).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%7D%7D")
    end

    it "should set render type as plain text" do
      @checker.options['render_type'] = 'plainText'

      expect(emitted_url).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22plainText%22%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%7D%7D")
    end

    it "should set output as json" do
      @checker.options['output_as_json'] = true

      expect(emitted_url).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22outputAsJson%22%3Atrue%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%7D%7D")
    end

    it "should not set ignore images" do
      @checker.options['ignore_images'] = false

      expect(emitted_url).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%7D%7D")
    end

    it "should set ignore images" do
      @checker.options['ignore_images'] = true

      expect(emitted_url).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22requestSettings%22%3A%7B%22ignoreImages%22%3Atrue%2C%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%7D%7D")
    end

    it "should set wait interval to zero" do
      @checker.options['wait_interval'] = '0'

      expect(emitted_url).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%2C%22wait_interval%22%3A%220%22%7D%7D")
    end

    it "should set user agent to BlackBerry" do
      @checker.options['user_agent'] = 'Mozilla/5.0 (BlackBerry; U; BlackBerry 9900; en) AppleWebKit/534.11+ (KHTML, like Gecko) Version/7.1.0.346 Mobile Safari/534.11+'

      expect(emitted_url).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Mozilla%2F5.0%20%28BlackBerry%3B%20U%3B%20BlackBerry%209900%3B%20en%29%20AppleWebKit%2F534.11%2B%20%28KHTML%2C%20like%20Gecko%29%20Version%2F7.1.0.346%20Mobile%20Safari%2F534.11%2B%22%7D%7D")
    end
  end
end