commit 5b3f050fd6f30f6b7bd945f6f4dc2f6811830c60
Author: Chakib (Spike) Benziane
Date:   Sat May 14 14:57:11 2011 +0200

    initial commit

diff --git a/.scrapy/scrapy.db b/.scrapy/scrapy.db
new file mode 100644
index 0000000..423a5b9
Binary files /dev/null and b/.scrapy/scrapy.db differ
diff --git a/PyGoogleSearch/__init__.py b/PyGoogleSearch/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/PyGoogleSearch/__init__.pyc b/PyGoogleSearch/__init__.pyc
new file mode 100644
index 0000000..b3b895c
Binary files /dev/null and b/PyGoogleSearch/__init__.pyc differ
diff --git a/PyGoogleSearch/items.py b/PyGoogleSearch/items.py
new file mode 100644
index 0000000..791a003
--- /dev/null
+++ b/PyGoogleSearch/items.py
@@ -0,0 +1,13 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/topics/items.html
+
+from scrapy.item import Item, Field
+
+class PyGSItem(Item):
+    # define the fields for your item here like:
+    # name = Field()
+    title = Field()
+    link = Field()
+    desc = Field()
diff --git a/PyGoogleSearch/items.pyc b/PyGoogleSearch/items.pyc
new file mode 100644
index 0000000..2a6f006
Binary files /dev/null and b/PyGoogleSearch/items.pyc differ
diff --git a/PyGoogleSearch/pipelines.py b/PyGoogleSearch/pipelines.py
new file mode 100644
index 0000000..93dddd9
--- /dev/null
+++ b/PyGoogleSearch/pipelines.py
@@ -0,0 +1,8 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/topics/item-pipeline.html
+
+class PygooglesearchPipeline(object):
+    def process_item(self, item, spider):
+        return item
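
Note on pipelines.py above: as generated by scrapy startproject, it is a
no-op that passes every item through. A minimal sketch of where per-item
processing would hook in, assuming a hypothetical ValidLinkPipeline that
drops results whose link field came back empty (DropItem is Scrapy's
standard way to reject an item from a pipeline):

    from scrapy.exceptions import DropItem

    class ValidLinkPipeline(object):
        '''Hypothetical pipeline: drop scraped results that have no link.'''
        def process_item(self, item, spider):
            if not item['link']:
                raise DropItem('Result has no link: %r' % item)
            return item

Such a pipeline only runs once it is listed in the ITEM_PIPELINES setting,
as the generated comment notes.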
diff --git a/PyGoogleSearch/settings.py b/PyGoogleSearch/settings.py
new file mode 100644
index 0000000..6f8467d
--- /dev/null
+++ b/PyGoogleSearch/settings.py
@@ -0,0 +1,16 @@
+# Scrapy settings for PyGoogleSearch project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/topics/settings.html
+#
+
+BOT_NAME = 'PyGoogleSearch'
+BOT_VERSION = '1.0'
+
+SPIDER_MODULES = ['PyGoogleSearch.spiders']
+NEWSPIDER_MODULE = 'PyGoogleSearch.spiders'
+DEFAULT_ITEM_CLASS = 'PyGoogleSearch.items.PyGSItem'
+USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
+
diff --git a/PyGoogleSearch/settings.pyc b/PyGoogleSearch/settings.pyc
new file mode 100644
index 0000000..5c151f9
Binary files /dev/null and b/PyGoogleSearch/settings.pyc differ
diff --git a/PyGoogleSearch/spiders/__init__.py b/PyGoogleSearch/spiders/__init__.py
new file mode 100755
index 0000000..c930799
--- /dev/null
+++ b/PyGoogleSearch/spiders/__init__.py
@@ -0,0 +1,8 @@
+# This package will contain the spiders of your Scrapy project
+#
+# To create the first spider for your project use this command:
+#
+#   scrapy genspider myspider myspider-domain.com
+#
+# For more info see:
+# http://doc.scrapy.org/topics/spiders.html
diff --git a/PyGoogleSearch/spiders/__init__.pyc b/PyGoogleSearch/spiders/__init__.pyc
new file mode 100644
index 0000000..01efa2d
Binary files /dev/null and b/PyGoogleSearch/spiders/__init__.pyc differ
diff --git a/PyGoogleSearch/spiders/google_spider.py b/PyGoogleSearch/spiders/google_spider.py
new file mode 100644
index 0000000..371ff59
--- /dev/null
+++ b/PyGoogleSearch/spiders/google_spider.py
@@ -0,0 +1,39 @@
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.selector import HtmlXPathSelector
+import time
+
+from PyGoogleSearch.items import PyGSItem
+
+class PyGSSpider(CrawlSpider):
+    '''
+    Crawl Google result pages, following the "Next" pagination link and
+    extracting the title, link and description of every result.
+    '''
+    name = 'google.com'
+    allowed_domains = ['google.com']
+    #start_urls = [
+    #    'http://www.google.fr/search?sclient=psy&hl=fr&source=hp&q=sexe&btnG=Rechercher'
+    #    ]
+    rules = (
+        # Follow only the "Next" pagination link (id="pnnext") on each page
+        Rule(SgmlLinkExtractor(restrict_xpaths='//a[@id="pnnext"]'),
+             callback='parse_item',
+             follow=True),
+    )
+
+    def parse_item(self, response):
+        # Wait between result pages to avoid hammering Google
+        time.sleep(3)
+        hxs = HtmlXPathSelector(response)
+        # Each organic result is an <li> inside the #ires results list
+        sites = hxs.select('//div[@id="ires"]/ol/li')
+        items = []
+        for site in sites:
+            item = PyGSItem()
+            item['title'] = site.select('h3[@class="r"]/a/text() | \
+                h3[@class="r"]/a/em/text()').extract()
+            item['desc'] = site.select('div/text()').extract()
+            item['link'] = site.select('h3[@class="r"]/a/@href').extract()
+            items.append(item)
+        return items
diff --git a/PyGoogleSearch/spiders/google_spider.pyc b/PyGoogleSearch/spiders/google_spider.pyc
new file mode 100644
index 0000000..829568a
Binary files /dev/null and b/PyGoogleSearch/spiders/google_spider.pyc differ
diff --git a/README b/README
new file mode 100644
index 0000000..b2b71a0
--- /dev/null
+++ b/README
@@ -0,0 +1,8 @@
+This project is released to the public domain.
+It is based on Scrapy, the open source screen scraping framework.
+Contact: spike@boxls.com
+
+PyGoogleSearch is a simple Google search results crawler. It comes with a
+simple command line interface for making queries and exports the results in
+JSON format to the file data.json.
+
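
Note on the output format: every field on PyGSItem is filled with
.extract(), which returns a list of matching nodes, so each record in
data.json maps field names to lists. A crawl's output would look roughly
like this (the values are illustrative, not real results):

    [{"title": ["Example Domain"],
      "link": ["http://example.com/"],
      "desc": ["An illustrative result description."]}]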
diff --git a/pygs b/pygs
new file mode 100755
index 0000000..9601bc7
--- /dev/null
+++ b/pygs
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+
+import subprocess
+from optparse import OptionParser
+from urllib import quote
+
+__VERSION__ = 0.1
+SEARCH_URL = '''\
+http://www.google.com/search?sclient=psy&hl=en&site=&source=hp&q=foo&btnG=Google+Search\
+'''
+SCRAPY_PARAMS = ['--set', 'FEED_URI=data.json', '--set', 'FEED_FORMAT=json']
+
+
+if __name__ == '__main__':
+    usage = 'usage: %prog query [OPTION...]'
+    op = OptionParser(usage, version='%%prog %s' % __VERSION__)
+    opts, args = op.parse_args()
+    if len(args) != 1:
+        print 'Bad arguments:'
+        op.print_usage()
+    else:
+        scrapy = 'scrapy'
+        scrapy_cmd = 'crawl'
+        query = args[0]
+        # Url-quote the query and substitute it for the placeholder term
+        url_query = quote(query)
+        search = SEARCH_URL.replace('foo', url_query)
+        print url_query
+        print 'Launching scrapy with parameters <%s> <%s>' % (scrapy, search)
+        # Reuse SCRAPY_PARAMS so the feed settings are defined in one place
+        p = subprocess.Popen([scrapy, scrapy_cmd, search] + SCRAPY_PARAMS,
+                             stdout=subprocess.PIPE,
+                             stderr=subprocess.PIPE)
+        output = p.communicate()[0]
+        print output
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..de5a1a7
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/topics/scrapyd.html
+
+[settings]
+default = PyGoogleSearch.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = PyGoogleSearch
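
Putting the pieces together: the pygs wrapper url-quotes the query, splices
it into SEARCH_URL in place of the placeholder term, and shells out to
scrapy crawl with the JSON feed settings. A session from the project root
would look roughly like this (query and output are illustrative):

    $ ./pygs 'scrapy tutorial'
    scrapy%20tutorial
    Launching scrapy with parameters <scrapy> <http://www.google.com/search?sclient=psy&hl=en&site=&source=hp&q=scrapy%20tutorial&btnG=Google+Search>

The scraped results end up in data.json in the working directory, as set by
FEED_URI.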