You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
41 lines
1.2 KiB
Python
41 lines
1.2 KiB
Python
from scrapy.contrib.spiders import CrawlSpider, Rule
|
|
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
|
|
from scrapy.selector import HtmlXPathSelector
|
|
import time
|
|
|
|
from PyGoogleSearch.items import PyGSItem
|
|
import sys
|
|
|
|
class PyGSSpider(CrawlSpider):
    """Crawl Google search result pages and extract each organic result.

    Pagination is followed via the "next page" anchor (``a#pnnext``);
    every fetched results page is handed to :meth:`parse_item`, which
    yields one ``PyGSItem`` (title, snippet, link) per result entry.
    """

    name = 'google.com'
    allowed_domains = ['google.com']
    # NOTE(review): no start_urls are active — the spider must be given a
    # starting search URL (e.g. via the commented example below) to run.
    #start_urls = [
    #    'http://www.google.fr/search?sclient=psy&hl=fr&source=hp&q=sexe&btnG=Rechercher'
    #    ]

    # Politeness delay between requests, enforced by Scrapy's scheduler.
    # This replaces the previous time.sleep(3) inside parse_item, which
    # blocked the Twisted reactor and stalled the whole (asynchronous)
    # crawl instead of merely spacing out requests.
    download_delay = 3

    rules = (
        # Follow only the pagination link to the next results page;
        # each followed page is parsed by parse_item.
        Rule(SgmlLinkExtractor(restrict_xpaths='//a[@id="pnnext"]'),
             callback='parse_item',
             follow=True),
    )

    def parse_item(self, response):
        """Extract search results from one Google results page.

        :param response: the downloaded results page (scrapy Response).
        :returns: list of ``PyGSItem`` with ``title``, ``desc`` and
            ``link`` fields (each an ``.extract()``-ed list of strings).
        """
        hxs = HtmlXPathSelector(response)
        # One <li> per organic result inside the #ires results container.
        sites = hxs.select('//div[@id="ires"]/ol/li')
        items = []
        for site in sites:
            item = PyGSItem()
            # Title text may be split across plain text and <em> (the
            # highlighted query terms), hence the XPath union.
            item['title'] = site.select('h3[@class="r"]/a/text() | '
                                        'h3[@class="r"]/a/em/text()').extract()
            item['desc'] = site.select('div/text()').extract()
            item['link'] = site.select('h3[@class="r"]/a/@href').extract()
            items.append(item)
        return items
|
|
|
|
|