initial commit

master
Chakib (Spike) Benziane 13 years ago
commit 5b3f050fd6

@@ -0,0 +1,13 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field

class PyGSItem(Item):
    # define the fields for your item here like:
    # name = Field()
    title = Field()
    link = Field()
    desc = Field()

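For reference, a minimal sketch of how such an item might be filled in and read; only the field names come from the class above, the values are hypothetical.

# Hypothetical usage sketch for PyGSItem; the values are made up for illustration.
from PyGoogleSearch.items import PyGSItem

item = PyGSItem()
item['title'] = ['Example result title']   # fields are set like dict entries
item['link'] = ['http://example.com/']
item['desc'] = ['Example snippet text']
print item['title']                        # and read back the same way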

@@ -0,0 +1,8 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

class PygooglesearchPipeline(object):
    def process_item(self, item, spider):
        return item

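As the comment above says, the pipeline only runs once it is listed in the ITEM_PIPELINES setting. A sketch of the corresponding settings.py entry, assuming the list-style ITEM_PIPELINES used by Scrapy releases of this era:

# settings.py (sketch): enable the pipeline defined above.
ITEM_PIPELINES = ['PyGoogleSearch.pipelines.PygooglesearchPipeline']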
@@ -0,0 +1,16 @@
# Scrapy settings for PyGoogleSearch project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/topics/settings.html
#

BOT_NAME = 'PyGoogleSearch'
BOT_VERSION = '1.0'

SPIDER_MODULES = ['PyGoogleSearch.spiders']
NEWSPIDER_MODULE = 'PyGoogleSearch.spiders'
DEFAULT_ITEM_CLASS = 'PyGoogleSearch.items.PyGSItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)

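The pygs wrapper further below passes the feed-export options with --set on every run; the same export could also be made permanent in this file. A sketch, assuming the standard FEED_URI / FEED_FORMAT settings:

# Optional sketch: bake the JSON export into the project settings instead of
# passing --set FEED_URI / --set FEED_FORMAT on the command line each time.
FEED_URI = 'data.json'
FEED_FORMAT = 'json'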

@@ -0,0 +1,8 @@
# This package will contain the spiders of your Scrapy project
#
# To create the first spider for your project use this command:
#
# scrapy genspider myspider myspider-domain.com
#
# For more info see:
# http://doc.scrapy.org/topics/spiders.html

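For comparison with the hand-written spider that follows, a rough sketch of the kind of scaffold scrapy genspider produced at the time; the names are the placeholders from the comment above, and the exact output depends on the Scrapy version and template.

# Rough sketch of a genspider-style scaffold (hypothetical, version-dependent).
from scrapy.spider import BaseSpider

class MyspiderSpider(BaseSpider):
    name = 'myspider'
    allowed_domains = ['myspider-domain.com']
    start_urls = ['http://www.myspider-domain.com/']

    def parse(self, response):
        pass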
@@ -0,0 +1,40 @@
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
import time
from PyGoogleSearch.items import PyGSItem
import sys

class PyGSSpider(CrawlSpider):
    '''
    docstring for PyGSSpider
    '''
    name = 'google.com'
    allowed_domains = ['google.com']
    #start_urls = [
    #    'http://www.google.fr/search?sclient=psy&hl=fr&source=hp&q=sexe&btnG=Rechercher'
    #    ]

    # Follow the "next page" link (id="pnnext") and parse every results page.
    rules = (
        Rule(SgmlLinkExtractor(restrict_xpaths='//a[@id="pnnext"]'),
             callback='parse_item',
             follow=True),
    )

    def parse_item(self, response):
        time.sleep(3)
        hxs = HtmlXPathSelector(response)
        # Each organic result is an <li> under div#ires.
        sites = hxs.select('//div[@id="ires"]/ol/li')
        items = []
        for site in sites:
            item = PyGSItem()
            item['title'] = site.select('h3[@class="r"]/a/text() | '
                                        'h3[@class="r"]/a/em/text()').extract()
            item['desc'] = site.select('div/text()').extract()
            item['link'] = site.select('h3[@class="r"]/a/@href').extract()
            items.append(item)
        return items

@@ -0,0 +1,8 @@
This project is released to the public domain.
It is based on Scrapy, an open source screen-scraping framework.

contact: spike@boxls.com

PyGoogleSearch is a simple Google data crawler. I created a simple command-line
interface to make queries; it exports the results in JSON format to the file
data.json.

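A short sketch of how the exported file could be consumed afterwards, assuming a crawl has already written data.json as described above:

# Sketch: load the JSON feed written by the crawler and print each result.
import json

with open('data.json') as f:
    results = json.load(f)

for result in results:
    print result.get('title'), result.get('link')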
pygs

@@ -0,0 +1,42 @@
#!/usr/bin/env python
import subprocess
from optparse import OptionParser
from urllib import quote
import sys

__VERSION__ = 0.1

SEARCH_URL = '''\
http://www.google.com/search?sclient=psy&hl=en&site=&source=hp&q=foo&btnG=Google+Search\
'''
SCRAPY_PARAMS = ['--set', 'FEED_URI=data.json', '--set', 'FEED_FORMAT=json']

if __name__ == '__main__':
    usage = 'usage: %prog query [OPTION...]'
    op = OptionParser(usage, version='%%prog %s' % __VERSION__)
    opts, args = op.parse_args()
    if len(args) != 1:
        print 'Bad arguments:'
        op.print_usage()
    else:
        scrapy = 'scrapy'
        scrapy_cmd = 'crawl'
        query = args[0]
        # URL-encode the query and substitute it into the search URL template.
        url_query = quote(query)
        search = SEARCH_URL.replace('foo', url_query)
        print url_query
        print 'Launching scrapy with parameters <%s> <%s>' % (scrapy, search)
        p = subprocess.Popen([scrapy, scrapy_cmd, search,
                              '--set', 'FEED_URI=data.json',
                              '--set', 'FEED_FORMAT=json'],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output = p.communicate()[0]
        print output

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html

[settings]
default = PyGoogleSearch.settings

[deploy]
#url = http://localhost:6800/
project = PyGoogleSearch