If you are crawling a page that requires JavaScript to be rendered before the data you need can be scraped, we can fetch that page using a headless browser. To render JavaScript, simply set render=true and we will use a headless Google Chrome instance to fetch the page. This feature is available on all plans.
API REQUEST
import requests

# Send the target URL to the API endpoint; 'render': 'true' asks for the page
# to be fetched with JavaScript rendering enabled.
payload = {'api_key': 'APIKEY', 'url': 'https://httpbin.org/ip', 'render': 'true'}
r = requests.get('http://api.scraperapi.com', params=payload)
print(r.text)

# Scrapy users can simply replace the urls in their start_urls and parse function
# ...other scrapy setup code
# NOTE: `url` is the page you want to scrape; it is expected to be defined by
# the surrounding spider code.
start_urls = ['http://api.scraperapi.com?api_key=APIKEY&url=' + url + '&render=true']


def parse(self, response):
    # ...your parsing logic here
    yield scrapy.Request(
        'http://api.scraperapi.com/?api_key=APIKEY&url=' + url + '&render=true',
        self.parse,
    )
PROXY MODE
import requests

# Route the request through the proxy; the render=true flag is carried in the
# proxy username and the API key in the proxy password.
proxies = {
    "http": "http://scraperapi.render=true:APIKEY@proxy-server.scraperapi.com:8001"
}
r = requests.get('http://httpbin.org/ip', proxies=proxies, verify=False)
print(r.text)

# Scrapy users can likewise pass the proxy URL (which carries their API key)
# in the request meta.
# NB: Scrapy skips SSL verification by default.
# ...other scrapy setup code
start_urls = ['http://httpbin.org/ip']
meta = {
    "proxy": "http://scraperapi.render=true:APIKEY@proxy-server.scraperapi.com:8001"
}


def parse(self, response):
    # ...your parsing logic here
    # NOTE: `url` and `headers` are expected to come from your own spider code.
    yield scrapy.Request(url, callback=self.parse, headers=headers, meta=meta)
SDK Method
from scraperapi_sdk import ScraperAPIClient

client = ScraperAPIClient('APIKEY')
# render must be the Python boolean True (lowercase `true` is an undefined name).
result = client.get(url='http://httpbin.org/ip', render=True).text
print(result)

# Scrapy users can simply replace the urls in their start_urls and parse function
# Note for Scrapy, you should not use DOWNLOAD_DELAY and
# RANDOMIZE_DOWNLOAD_DELAY, these will lower your concurrency and are not
# needed with our API
# ...other scrapy setup code
start_urls = [client.scrapyGet(url='http://httpbin.org/ip', render=True)]


def parse(self, response):
    # ...your parsing logic here
    yield scrapy.Request(
        client.scrapyGet(url='http://httpbin.org/ip', render=True),
        self.parse,
    )