Installation
Install Qt4
# Download the Qt SDK offline installer (32-bit Linux).
# Landing page: http://qt.nokia.com/downloads/sdk-linux-x11-32bit-cpp-offline
# -O names the saved file explicitly: without it wget derives the file name
# from the URL ("dp?uri=..."), and the chmod/run steps below would not find
# the installer. The URL is quoted so the shell never treats "?" as a glob.
wget -O QtSdk-offline-linux-x86-v1.2.1.run "http://www.developer.nokia.com/dp?uri=http%3A%2F%2Fsw.nokia.com%2Fid%2F8ea74da4-fec1-4277-8b26-c58cc82e204b%2FQt_SDK_Lin32_offline"
chmod u+x ./QtSdk-offline-linux-x86-v1.2.1.run
sudo ./QtSdk-offline-linux-x86-v1.2.1.run
# install the python-lxml and qt4-qmake packages
sudo apt-get install -y python-lxml qt4-qmake
Python Libraries
# install cssselect - fix "ImportError: No module named cssselect"
sudo pip install cssselect
# install webkit-server from source
git clone https://github.com/niklasb/webkit-server.git webkit-server
# Run the install in a subshell so the working directory is unchanged
# afterwards; a plain "cd" would make the dryscrape clone below land inside
# the webkit-server checkout.
(cd webkit-server && sudo python setup.py install)
# install dryscrape from source
# (alternative: sudo pip install dryscrape)
git clone https://github.com/niklasb/dryscrape.git dryscrape
(cd dryscrape && sudo python setup.py install)
dryscrape_test.py – Test File
cat > dryscrape_test.py <<"_EOF_"
# -*- coding: utf-8 -*-
# Smoke test for dryscrape: submit a search on google.com, print every link
# found on the page reached after submitting, and save a screenshot.
import dryscrape

search_keyword = 'dryscrape'

# set up a web scraping session; relative URLs resolve against base_url
session = dryscrape.Session(base_url='http://google.com')

# we don't need images
session.set_attribute('auto_load_images', False)

# visit homepage and search for a term
session.visit('/')
q = session.at_xpath('//*[@name="q"]')
q.set(search_keyword)
q.form().submit()

# extract all links
for link in session.xpath('//a[@href]'):
    # call form of print works on both Python 2 and Python 3
    print(link['href'])

# save a screenshot of the web page
session.render('google.png')
print("Screenshot written to 'google.png'")