From 6a71295205bcb5abf2efe4b6037bf49babdb0694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A9za=20B=C3=BAza?= Date: Tue, 25 Jul 2017 19:10:37 +0200 Subject: [PATCH] Improve documentation of Website Agent (#2066) Add sample JSON input and output events to clarify how JSONPath works. --- app/models/agents/website_agent.rb | 47 +++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/app/models/agents/website_agent.rb b/app/models/agents/website_agent.rb index 97427832..e473a926 100644 --- a/app/models/agents/website_agent.rb +++ b/app/models/agents/website_agent.rb @@ -58,13 +58,58 @@ module Agents # Scraping JSON - When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about. For example: + When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about. + + Sample incoming event: + + { "results": { + "data": [ + { + "title": "Lorem ipsum 1", + "description": "Aliquam pharetra leo ipsum.", + "price": 8.95 + }, + { + "title": "Lorem ipsum 2", + "description": "Suspendisse a pulvinar lacus.", + "price": 12.99 + }, + { + "title": "Lorem ipsum 3", + "description": "Praesent ac arcu tellus.", + "price": 8.99 + } + ] + } + } + + Sample rule: "extract": { "title": { "path": "results.data[*].title" }, "description": { "path": "results.data[*].description" } } + In this example the `*` wildcard character makes the parser iterate through all items of the `data` array. Three events will be created as a result. + + Sample outgoing events: + + [ + { + "title": "Lorem ipsum 1", + "description": "Aliquam pharetra leo ipsum." + }, + { + "title": "Lorem ipsum 2", + "description": "Suspendisse a pulvinar lacus." + }, + { + "title": "Lorem ipsum 3", + "description": "Praesent ac arcu tellus." + } + ] + + The `extract` option can be skipped for the JSON type, causing the full JSON response to be returned. 
# Scraping Text