Documentation

API

Web Scraper Cloud can be managed via an HTTPS JSON API. The API allows you to manage sitemaps, scraping jobs and download data. Use our PHP SDK when developing your application in PHP.

API access token can be found in Web Scraper Cloud API page.

API call limit

By default, each user has a limit of 200 API calls per 15 minutes. Limit can be tracked by API call response headers:

X-RateLimit-Limit: 200
X-RateLimit-Remaining: 199
X-RateLimit-Reset: 1609372800   // returned only when limit is reached

If more API calls are required, please contact support.

Scraping job status

Scraping job can have one of these statuses:

  • waiting-to-be-scheduled - the scraping job is waiting in a queue to be scraped;
  • scheduled - the scraping job is waiting for the scraper server and will start in a moment;
  • started - the scraping job is in motion;
  • failed - the website returned more than 50% 4xx or 5xx responses or there were network errors, which means that job execution was stopped and scraping job marked as failed; however, the user can continue it manually;
  • finished - the scraping job has completed successfully without any failed or empty pages;
  • shelved - the scraping job has been moved to cold storage, meaning that either it stopped and then was moved to cold storage or it finished with empty or failed pages. This status will be removed in a future release;
  • stopped - the scraping job has been stopped manually by a user and will change its status to "shelved" after 2 weeks.

API calls

Create Sitemap

// Execute POST request to https://api.webscraper.io/api/v1/sitemap?api_token=<YOUR API TOKEN>
{
	"_id": "webscraper-io-landing",
	"startUrl": [
		"http://webscraper.io/"
	],
	"selectors": [
		{
			"parentSelectors": [
				"_root"
			],
			"type": "SelectorText",
			"multiple": false,
			"id": "title",
			"selector": "h1",
			"regex": "",
			"delay": ""
		}
	]
}


// response
{
	"success": true,
	"data": {
		"id": 123
	}
}
<?php
// Create a new sitemap via the API client.
// The sitemap is defined as a JSON string (same structure as exported by the
// Web Scraper browser extension) and decoded to an associative array before
// being passed to the client.
$sitemapJSON = '
{
	"_id": "webscraper-io-landing",
	"startUrl": [
		"http://webscraper.io/"
	],
	"selectors": [
		{
			"parentSelectors": [
				"_root"
			],
			"type": "SelectorText",
			"multiple": false,
			"id": "title",
			"selector": "h1",
			"regex": "",
			"delay": ""
		}
	]
}
';

// true => decode as associative array (the format createSitemap() expects).
$sitemap = json_decode($sitemapJSON, true);
// On success the response contains the new sitemap's numeric id (see example above).
$response = $client->createSitemap($sitemap);

Get Sitemap

// Execute GET request to https://api.webscraper.io/api/v1/sitemap/<Sitemap ID>?api_token=<YOUR API TOKEN>

// response
{
	"success": true,
	"data": {
		"id": 123
		"name": "webscraper-io-landing",
		"sitemap": "{\"_id\": \"webscraper-io-landing\", ...}",
	}
}
<?php
// Retrieve a single sitemap by its numeric id.
// Note: 'sitemap' in the result is the sitemap definition as a JSON string.
$sitemap = $client->getSitemap($sitemapId);

//output:
[
	'id' => 123,
	'name' => 'webscraper-io-landing',
	'sitemap' => '{"_id": "webscraper-io-landing", ...}',
];

Get Sitemaps

// Execute GET request to https://api.webscraper.io/api/v1/sitemaps?api_token=<YOUR API TOKEN>

// response
{
	"success": true,
	"data": [
		{
			"id": 123
			"name": "webscraper-io-landing",
		},
		{
			"id": 123
			"name": "webscraper-io-landing2",
		}
	],
	"current_page": 1,
	"last_page": 1,
	"total": 2,
	"per_page": 100,
}

// Execute GET request to get second page https://api.webscraper.io/api/v1/sitemaps?api_token=<YOUR API TOKEN>&page=2
<?php
// Retrieve all sitemaps. The client returns an Iterator that transparently
// handles API pagination (100 records per page).
$sitemaps = $client->getSitemaps();

// response (Iterator):
[
	[
		'id' => 123,
		'name' => 'webscraper-io-landing',
	],
];

// iterate through all sitemaps
$sitemaps = $client->getSitemaps();
foreach($sitemaps as $sitemap) {
	var_dump($sitemap);
}

// iterate through all sitemaps while manually handling pagination
$iterator = $client->getSitemaps();
$page = 1;
do {
	// getPageData() fetches a single API result page (pages are 1-based).
	$sitemaps = $iterator->getPageData($page);
	foreach($sitemaps as $sitemap) {
		var_dump($sitemap);
	}
	$page++;
} while($page <= $iterator->getLastPage());

Delete Sitemap

// Execute DELETE request to https://api.webscraper.io/api/v1/sitemap/<Sitemap ID>?api_token=<YOUR API TOKEN>

// response
{
	"success": true,
	"data": "ok"
}
<?php
// Delete the sitemap with the given id; the API responds with "ok" on success.
$client->deleteSitemap(123);

Create Scraping Job

// Execute POST request to https://api.webscraper.io/api/v1/scraping-job?api_token=<YOUR API TOKEN>
{
	"sitemap_id": 123,
	"driver": "fast", // "fast" or "fulljs"
	"page_load_delay": 2000,
	"request_interval": 2000,
	"proxy": 0 // optional. 0 - no proxy, 1 - use proxy. Or proxy id for Scale plan users
}

// response
{
	"success": true,
	"data": {
		"id": 500
	}
}
<?php
// Start a new scraping job for an existing sitemap.
// Delays/intervals are in milliseconds.
$response = $client->createScrapingJob([
	'sitemap_id' => 123,
	'driver' => 'fast', // 'fast' or 'fulljs'
	'page_load_delay' => 2000,
	'request_interval' => 2000,
	'proxy' => 0, // optional. 0 - no proxy, 1 - use proxy. Or proxy id for Scale plan users
]);

// response
['id' => 500]

Get Scraping Job

// Execute GET request to https://api.webscraper.io/api/v1/scraping-job/<SCRAPING JOB ID>?api_token=<YOUR API TOKEN>

// response
{
	"success": true,
	"data": {
		"id": 500,
		"sitemap_name": "webscraper-io-landing",
		"status": "scheduling",
		"sitemap_id": 123,
		"test_run": 0,
		"jobs_scheduled": 0,
		"jobs_executed": 0,
		"jobs_failed": 0,
		"jobs_empty": 0,
		"stored_record_count": 0,
		"request_interval": 2000,
		"page_load_delay": 2000,
		"driver": "fast",
		"scheduled": 0, // scraping job was started by scheduler
		"time_created": "1493370624", // unix timestamp
	}
}
<?php
// Fetch the current state of a scraping job by its id (see "Scraping job
// status" above for the possible status values).
$client->getScrapingJob(500);

// response
[
	'id' => 500,
	'sitemap_name' => 'webscraper-io-landing',
	'status' => 'scheduling',
	'sitemap_id' => 123,
	'test_run' => 0,
	'jobs_scheduled' => 0,
	'jobs_executed' => 0,
	'jobs_failed' => 0,
	'jobs_empty' => 0,
	'stored_record_count' => 0,
	'request_interval' => 2000,
	'page_load_delay' => 2000,
	'driver' => 'fast',
	'scheduled' => 0, // scraping job was started by scheduler
	'time_created' => '1493370624', // unix timestamp
]

Get Scraping Jobs

// Execute GET request to https://api.webscraper.io/api/v1/scraping-jobs?api_token=<YOUR API TOKEN>

// Execute GET request to get second page https://api.webscraper.io/api/v1/scraping-jobs?api_token=<YOUR API TOKEN>&page=2

// Execute GET request to filter by sitemap_id https://api.webscraper.io/api/v1/scraping-jobs?api_token=<YOUR API TOKEN>&page=2&sitemap_id=123

// response
{
	"success": true,
	"data": [
		{
			"id": 500,
			"sitemap_name": "webscraper-io-landing",
			"status": "scheduling",
			"sitemap_id": 123,
			"test_run": 0,
			"jobs_scheduled": 0,
			"jobs_executed": 0,
			"jobs_failed": 0,
			"jobs_empty": 0,
			"stored_record_count": 0,
			"request_interval": 2000,
			"page_load_delay": 2000,
			"driver": "fast",
			"scheduled": 0, // scraping job was started by scheduler
			"time_created": "1493370624", // unix timestamp
		},
		{
		...
		}
	],
	"current_page": 1,
	"last_page": 1,
	"total": 5,
	"per_page": 100,
}
<?php
// Retrieve all scraping jobs, optionally filtered by sitemap id (pass null
// for no filter). Returns an Iterator that handles pagination transparently.
$client->getScrapingJobs($sitemapId = null);

// response (iterator)
[
	[
		'id' => 500,
		'sitemap_name' => 'webscraper-io-landing',
		'status' => 'scheduling',
		'sitemap_id' => 123,
		'test_run' => 0,
		'jobs_scheduled' => 0,
		'jobs_executed' => 0,
		'jobs_failed' => 0,
		'jobs_empty' => 0,
		'stored_record_count' => 0,
		'request_interval' => 2000,
		'page_load_delay' => 2000,
		'driver' => 'fast',
		'scheduled' => 0, // scraping job was started by scheduler
		'time_created' => '1493370624', // unix timestamp
	],
	[
	...
	],
]

// iterate through all scraping jobs
$scrapingJobs = $client->getScrapingJobs();
foreach($scrapingJobs as $scrapingJob) {
	var_dump($scrapingJob);
}

// iterate through all scraping jobs while manually handling pagination
$iterator = $client->getScrapingJobs();
$page = 1;
do {
	// getPageData() fetches a single API result page (pages are 1-based).
	$scrapingJobs = $iterator->getPageData($page);
	foreach($scrapingJobs as $scrapingJob) {
		var_dump($scrapingJob);
	}
	$page++;
} while($page <= $iterator->getLastPage());

Download scraped data in JSON format

// Execute GET request to https://api.webscraper.io/api/v1/scraping-job/<SCRAPING JOB ID>/json?api_token=<YOUR API TOKEN>

// Response - a file with one JSON string per line.
{"title":"Nokia 123","price":"$24.99","description":"7 day battery"}
{"title":"ProBook","price":"$739.99","description":"14\", Core i5 2.6GHz, 4GB, 500GB, Win7 Pro 64bit"}
{"title":"ThinkPad X240","price":"$1311.99","description":"12.5\", Core i5-4300U, 8GB, 240GB SSD, Win7 Pro 64bit"}
{"title":"Aspire E1-572G","price":"$581.99","description":"15.6\", Core i5-4200U, 8GB, 1TB, Radeon R7 M265, Windows 8.1"}
<?php

use WebScraper\ApiClient\Client;
use WebScraper\ApiClient\Reader\JsonReader;

// download file locally
// The downloaded file contains one JSON record per line (see example above).
$outputFile = "/tmp/scrapingjob{$scrapingJobId}.json";
$client->downloadScrapingJobJSON($scrapingJobId, $outputFile);

// read data from file with built in JSON reader
// fetchRows() yields one decoded record at a time, so large result files can
// be processed without loading everything into memory.
$reader = new JsonReader($outputFile);
$rows = $reader->fetchRows();
foreach ($rows as $row) {
	echo "ROW: " . json_encode($row) . "\n";
}

Download scraped data in CSV format

// Execute GET request to https://api.webscraper.io/api/v1/scraping-job/<SCRAPING JOB ID>/csv?api_token=<YOUR API TOKEN>

// NOTE! We recommend using JSON format since multiple CSV notations are
// being used by different products. For example:
// CSV Standard: https://tools.ietf.org/html/rfc4180
// MS Excel cannot handle escape sequences from the CSV standard
// PHP has incorrect default implementation. See https://wiki.php.net/rfc/kill-csv-escaping

// response - A CSV file
web-scraper-order,title,Color
1494492462-1,Fluffy Cat,blue
1494492462-1,Fluffy Dog,white
<?php

// Requires the third-party league/csv package for parsing.
use League\Csv\Reader;

// Download CSV
$outputFile = "/tmp/scrapingjob-data{$scrapingJobId}.csv";
$client->downloadScrapingJobCSV($scrapingJobId, $outputFile);

// read CSV file
// NOTE! We recommend using json format since multiple CSV notations are
// being used by different products. For example:
// CSV Standard: https://tools.ietf.org/html/rfc4180
// MS Excel cannot handle escape sequences from the CSV standard
// PHP has incorrect default implementation. See https://wiki.php.net/rfc/kill-csv-escaping
// fetchAssoc() uses the first CSV row as keys for each returned record.
$records = Reader::createFromPath($outputFile)->fetchAssoc();

foreach($records as $record) {
	// Import records into database. Importing records in bulk will speed up
	// the process.
}

Delete Scraping Job

// Execute DELETE request to https://api.webscraper.io/api/v1/scraping-job/<SCRAPING JOB ID>?api_token=<YOUR API TOKEN>

// response
{
	"success": true,
	"data": "ok"
}
<?php
// Delete the scraping job with the given id; the API responds with "ok" on success.
$client->deleteScrapingJob(500);

Account info

// Execute GET request to https://api.webscraper.io/api/v1/account?api_token=<YOUR API TOKEN>

// response
{
	"success": true,
	"data": {
		"email": "user@example.com",
		"firstname": "John",
		"lastname": "Deere",
		"page_credits": 500
	}
}
<?php
// Fetch account details for the API token in use, including remaining
// page credits.
$client->getAccountInfo();

// response
[
	'email' => 'user@example.com',
	'firstname' => 'John',
	'lastname' => 'Deere',
	'page_credits' => 500,
]