You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

41 lines
1.2 KiB
Python

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
import time
from PyGoogleSearch.items import PyGSItem
import sys
class PyGSSpider(CrawlSpider):
    '''
    Spider that walks Google result pages and scrapes their entries.

    The single crawl rule follows the "next page" anchor (``id="pnnext"``)
    and hands every page reached that way to ``parse_item``, which builds
    one ``PyGSItem`` per ``<li>`` of the ``div#ires`` result list.

    NOTE(review): ``start_urls`` is not set here, so the starting search
    URL has to be supplied some other way. Also, CrawlSpider only passes
    responses reached through ``rules`` to the rule callback -- confirm
    whether the starting page itself is meant to be scraped as well.
    '''
    name = 'google.com'
    allowed_domains = ['google.com']
    # Example of a search URL this spider can be started from:
    #start_urls = [
    # 'http://www.google.fr/search?sclient=psy&hl=fr&source=hp&q=sexe&btnG=Rechercher'
    # ]

    # Space requests out by 3 seconds in the downloader. Calling
    # time.sleep() inside a callback would block Scrapy's (Twisted) event
    # loop and stall every in-flight request for the duration of the sleep.
    download_delay = 3

    rules = (
        Rule(
            # Only follow the pagination link to the next result page.
            SgmlLinkExtractor(restrict_xpaths='//a[@id="pnnext"]'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        '''
        Extract every search result found on one result page.

        response -- the downloaded result page.

        Returns a list of ``PyGSItem`` objects, one per ``<li>`` under
        ``div#ires``; the list is empty when the page has no such entries.
        '''
        hxs = HtmlXPathSelector(response)
        results = hxs.select('//div[@id="ires"]/ol/li')
        return [self._build_item(result) for result in results]

    def _build_item(self, result):
        '''Build a ``PyGSItem`` from the selector of one result entry.'''
        # Adjacent string literals are used instead of a backslash
        # continuation inside the literal, so that source indentation never
        # ends up embedded in the XPath expression.
        title_xpath = ('h3[@class="r"]/a/text() | '
                       'h3[@class="r"]/a/em/text()')
        item = PyGSItem()
        # Each field holds the list of strings returned by extract().
        item['title'] = result.select(title_xpath).extract()
        item['desc'] = result.select('div/text()').extract()
        item['link'] = result.select('h3[@class="r"]/a/@href').extract()
        return item