From 4e2d1775a6f1fc9ae8f6c7a6fe212813b8d7b5df Mon Sep 17 00:00:00 2001
From: Irfan Charania
Date: Tue, 20 Dec 2016 09:44:54 -0800
Subject: [PATCH] PhantomJs Cloud Agent (#1503)

* Initial draft of PhantomJsCloudAgent

Generates event with url for fetching html/plainText content

* Add options

* Pass in event instead of url

Fix hash syntax
Remove whitespace
Add mode merge

* Add some tests

* Style changes

- Add link to wiki entry for manually creating agent with full set of options
---
 app/models/agents/phantom_js_cloud_agent.rb   | 170 ++++++++++++++++++
 .../agents/phantom_js_cloud_agent_spec.rb     | 117 +++++++++++++
 2 files changed, 287 insertions(+)
 create mode 100644 app/models/agents/phantom_js_cloud_agent.rb
 create mode 100644 spec/models/agents/phantom_js_cloud_agent_spec.rb

diff --git a/app/models/agents/phantom_js_cloud_agent.rb b/app/models/agents/phantom_js_cloud_agent.rb
new file mode 100644
index 00000000..f0c9e605
--- /dev/null
+++ b/app/models/agents/phantom_js_cloud_agent.rb
@@ -0,0 +1,170 @@
+require 'json'
+require 'uri'
+
+module Agents
+  class PhantomJsCloudAgent < Agent
+    include ERB::Util
+    include FormConfigurable
+    include WebRequestConcern
+
+    can_dry_run!
+
+    default_schedule 'every_12h'
+
+    description <<-MD
+      [PhantomJs Cloud](https://phantomjscloud.com/) renders webpages in much the same way as a browser would, and allows the Website Agent to properly scrape dynamic content from javascript-heavy pages.
+
+      The Phantom Js Cloud Agent is used to formulate a url in accordance with the [PhantomJs Cloud API](https://phantomjscloud.com/docs/index.html).
+      This url can then be supplied to Website Agent to fetch and parse content.
+
+      [Sign up](https://dashboard.phantomjscloud.com/dash.html#/signup) to get an api key, and add it in Huginn credentials.
+
+
+      Options:
+
+      * `Api key` - PhantomJs Cloud API Key credential stored in Huginn
+      * `Url` - The url to render
+      * `Mode` - Create a new `clean` event or `merge` old payload with new values (default: `clean`)
+      * `Render type` - Render as html or plain text without html tags (default: `html`)
+      * `Output as json` - Return the page contents and metadata as a JSON object (default: `false`)
+      * `Ignore images` - Skip loading of inlined images (default: `false`)
+      * `User agent` - A custom User-Agent name (default: `#{default_user_agent}`)
+      * `Wait interval` - Milliseconds to delay rendering after the last resource is finished loading.
+      This is useful in case there are any AJAX requests or animations that need to finish up.
+      This can safely be set to 0 if you know there are no AJAX or animations you need to wait for (default: `1000`ms)
+
+
+      As this agent only provides a limited subset of the most commonly used options, you can follow [this guide](https://github.com/cantino/huginn/wiki/Browser-Emulation-Using-PhantomJS-Cloud) to make full use of additional options PhantomJsCloud provides.
+
+    MD
+
+    event_description <<-MD
+      Events look like this:
+        {
+          "url": "..."
+        }
+    MD
+
+    # Default option values for this agent.
+    def default_options
+      {
+        'mode' => 'clean',
+        'url' => 'http://xkcd.com',
+        'render_type' => 'html',
+        'output_as_json' => false,
+        'ignore_images' => false,
+        'user_agent' => self.class.default_user_agent,
+        'wait_interval' => '1000'
+      }
+    end
+
+    form_configurable :mode, type: :array, values: ['clean', 'merge']
+    form_configurable :api_key, roles: :completable
+    form_configurable :url
+    form_configurable :render_type, type: :array, values: ['html', 'plainText']
+    form_configurable :output_as_json, type: :boolean
+    form_configurable :ignore_images, type: :boolean
+    form_configurable :user_agent, type: :text
+    form_configurable :wait_interval
+
+    # Option readers: each returns the interpolated value, or a default when it is blank.
+    def mode
+      interpolated['mode'].presence || default_options['mode']
+    end
+
+    def render_type
+      interpolated['render_type'].presence || default_options['render_type']
+    end
+
+    def output_as_json
+      boolify(interpolated['output_as_json'].presence ||
+        default_options['output_as_json'])
+    end
+
+    def ignore_images
+      boolify(interpolated['ignore_images'].presence ||
+        default_options['ignore_images'])
+    end
+
+    def user_agent
+      interpolated['user_agent'].presence || self.class.default_user_agent
+    end
+
+    def wait_interval
+      interpolated['wait_interval'].presence || default_options['wait_interval']
+    end
+
+    # Collects the optional `requestSettings` entries; unset or default values are left out.
+    def page_request_settings
+      prs = {}
+
+      prs[:ignoreImages] = ignore_images if ignore_images
+      prs[:userAgent] = user_agent if user_agent.present?
+
+      if wait_interval != default_options['wait_interval']
+        # The API setting is named `waitInterval` (camelCase, like the keys above).
+        prs[:waitInterval] = wait_interval
+      end
+
+      prs
+    end
+
+    # Serializes the page request to JSON and embeds it, URL-encoded, in the API url.
+    def build_phantom_url(interpolated)
+      api_key = interpolated[:api_key]
+      page_request_hash = {
+        url: interpolated[:url],
+        renderType: render_type
+      }
+
+      page_request_hash[:outputAsJson] = output_as_json if output_as_json
+
+      page_request_settings_hash = page_request_settings
+
+      if page_request_settings_hash.any?
+        page_request_hash[:requestSettings] = page_request_settings_hash
+      end
+
+      request = page_request_hash.to_json
+      log "Generated request: #{request}"
+
+      encoded = url_encode(request)
+      "https://phantomjscloud.com/api/browser/v2/#{api_key}/?request=#{encoded}"
+    end
+
+    # Emits a single event whose payload holds the generated url.
+    def check
+      phantom_url = build_phantom_url(interpolated)
+
+      create_event payload: { 'url' => phantom_url }
+    end
+
+    # Emits one event per incoming event; in `merge` mode the incoming payload is carried over.
+    def receive(incoming_events)
+      incoming_events.each do |event|
+        interpolate_with(event) do
+          existing_payload = interpolated['mode'].to_s == 'merge' ? event.payload : {}
+          phantom_url = build_phantom_url(interpolated)
+
+          result = { 'url' => phantom_url }
+          create_event payload: existing_payload.merge(result)
+        end
+      end
+    end
+
+    # Completion candidates for `api_key`: one `{% credential ... %}` tag per stored user credential.
+    def complete_api_key
+      user.user_credentials.map { |c| { text: c.credential_name, id: "{% credential #{c.credential_name} %}" } }
+    end
+
+    def working?
+      !recent_error_logs? || received_event_without_error?
+    end
+
+    def validate_options
+      # Check for required fields
+      errors.add(:base, 'Url is required') unless options['url'].present?
+      errors.add(:base, 'API key (credential) is required') unless options['api_key'].present?
+    end
+  end
+end
diff --git a/spec/models/agents/phantom_js_cloud_agent_spec.rb b/spec/models/agents/phantom_js_cloud_agent_spec.rb
new file mode 100644
index 00000000..07a79c2b
--- /dev/null
+++ b/spec/models/agents/phantom_js_cloud_agent_spec.rb
@@ -0,0 +1,117 @@
+require 'rails_helper'
+
+describe Agents::PhantomJsCloudAgent do
+  before do
+
+    @valid_options = {
+      'name' => "XKCD",
+      'render_type' => "html",
+      'url' => "http://xkcd.com",
+      'mode' => 'clean',
+      'api_key' => '1234567890'
+    }
+
+    @checker = Agents::PhantomJsCloudAgent.new(:name => "xkcd", :options => @valid_options, :keep_events_for => 2.days)
+    @checker.user = users(:jane)
+    @checker.save!
+  end
+
+  describe "validations" do
+    before do
+      expect(@checker).to be_valid
+    end
+
+    it "should validate the presence of url" do
+      @checker.options['url'] = "http://google.com"
+      expect(@checker).to be_valid
+
+      @checker.options['url'] = ""
+      expect(@checker).not_to be_valid
+
+      @checker.options['url'] = nil
+      expect(@checker).not_to be_valid
+    end
+
+  end
+
+  describe "emitting event" do
+    it "should emit url as event" do
+      expect {
+        @checker.check
+      }.to change { @checker.events.count }.by(1)
+
+      item,* = @checker.events.last(1)
+      expect(item.payload['url']).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%7D%7D")
+    end
+
+    it "should set render type as plain text" do
+      @checker.options['render_type'] = 'plainText'
+
+      expect {
+        @checker.check
+      }.to change { @checker.events.count }.by(1)
+
+      item,* = @checker.events.last(1)
+      expect(item.payload['url']).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22plainText%22%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%7D%7D")
+    end
+
+    it "should set output as json" do
+      @checker.options['output_as_json'] = true
+
+      expect {
+        @checker.check
+      }.to change { @checker.events.count }.by(1)
+
+      item,* = @checker.events.last(1)
+      expect(item.payload['url']).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22outputAsJson%22%3Atrue%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%7D%7D")
+    end
+
+    it "should not set ignore images" do
+      @checker.options['ignore_images'] = false
+
+      expect {
+        @checker.check
+      }.to change { @checker.events.count }.by(1)
+
+      item,* = @checker.events.last(1)
+      expect(item.payload['url']).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%7D%7D")
+    end
+
+    it "should set ignore images" do
+      @checker.options['ignore_images'] = true
+
+      expect {
+        @checker.check
+      }.to change { @checker.events.count }.by(1)
+
+      item,* = @checker.events.last(1)
+      expect(item.payload['url']).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22requestSettings%22%3A%7B%22ignoreImages%22%3Atrue%2C%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%7D%7D")
+    end
+
+    it "should set wait interval to zero" do
+      @checker.options['wait_interval'] = '0'
+
+      expect {
+        @checker.check
+      }.to change { @checker.events.count }.by(1)
+
+      item,* = @checker.events.last(1)
+      expect(item.payload['url']).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Huginn%20-%20https%3A%2F%2Fgithub.com%2Fcantino%2Fhuginn%22%2C%22waitInterval%22%3A%220%22%7D%7D")
+    end
+
+    it "should set user agent to BlackBerry" do
+      @checker.options['user_agent'] = 'Mozilla/5.0 (BlackBerry; U; BlackBerry 9900; en) AppleWebKit/534.11+ (KHTML, like Gecko) Version/7.1.0.346 Mobile Safari/534.11+'
+
+      expect {
+        @checker.check
+      }.to change { @checker.events.count }.by(1)
+
+      item,* = @checker.events.last(1)
+      expect(item.payload['url']).to eq("https://phantomjscloud.com/api/browser/v2/1234567890/?request=%7B%22url%22%3A%22http%3A%2F%2Fxkcd.com%22%2C%22renderType%22%3A%22html%22%2C%22requestSettings%22%3A%7B%22userAgent%22%3A%22Mozilla%2F5.0%20%28BlackBerry%3B%20U%3B%20BlackBerry%209900%3B%20en%29%20AppleWebKit%2F534.11%2B%20%28KHTML%2C%20like%20Gecko%29%20Version%2F7.1.0.346%20Mobile%20Safari%2F534.11%2B%22%7D%7D")
+    end
+
+
+
+  end
+
+end