initial commit

master
Chakib (Spike) Benziane 13 years ago
commit 5b3f050fd6

@@ -0,0 +1,13 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field

class PyGSItem(Item):
    # define the fields for your item here like:
    # name = Field()
    title = Field()
    link = Field()
    desc = Field()

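For reference, a minimal sketch of how such an item might be filled in and read; only the field names come from the class above, the values are hypothetical.

# Hypothetical usage sketch for PyGSItem; the values are made up for illustration.
from PyGoogleSearch.items import PyGSItem

item = PyGSItem()
item['title'] = ['Example result title']   # fields are set like dict entries
item['link'] = ['http://example.com/']
item['desc'] = ['Example snippet text']
print item['title']                        # and read back the same way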

@@ -0,0 +1,8 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

class PygooglesearchPipeline(object):
    def process_item(self, item, spider):
        return item

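As the comment above says, the pipeline only runs once it is listed in the ITEM_PIPELINES setting. A sketch of the corresponding settings.py entry, assuming the list-style ITEM_PIPELINES used by Scrapy releases of this era:

# settings.py (sketch): enable the pipeline defined above.
ITEM_PIPELINES = ['PyGoogleSearch.pipelines.PygooglesearchPipeline']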
@@ -0,0 +1,16 @@
# Scrapy settings for PyGoogleSearch project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/topics/settings.html
#

BOT_NAME = 'PyGoogleSearch'
BOT_VERSION = '1.0'

SPIDER_MODULES = ['PyGoogleSearch.spiders']
NEWSPIDER_MODULE = 'PyGoogleSearch.spiders'
DEFAULT_ITEM_CLASS = 'PyGoogleSearch.items.PyGSItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)

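The pygs wrapper further below passes the feed-export options with --set on every run; the same export could also be made permanent in this file. A sketch, assuming the standard FEED_URI / FEED_FORMAT settings:

# Optional sketch: bake the JSON export into the project settings instead of
# passing --set FEED_URI / --set FEED_FORMAT on the command line each time.
FEED_URI = 'data.json'
FEED_FORMAT = 'json'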

@@ -0,0 +1,8 @@
# This package will contain the spiders of your Scrapy project
#
# To create the first spider for your project use this command:
#
# scrapy genspider myspider myspider-domain.com
#
# For more info see:
# http://doc.scrapy.org/topics/spiders.html

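For comparison with the hand-written spider that follows, a rough sketch of the kind of scaffold scrapy genspider produced at the time; the names are the placeholders from the comment above, and the exact output depends on the Scrapy version and template.

# Rough sketch of a genspider-style scaffold (hypothetical, version-dependent).
from scrapy.spider import BaseSpider

class MyspiderSpider(BaseSpider):
    name = 'myspider'
    allowed_domains = ['myspider-domain.com']
    start_urls = ['http://www.myspider-domain.com/']

    def parse(self, response):
        pass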
@@ -0,0 +1,40 @@
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
import time
from PyGoogleSearch.items import PyGSItem
import sys

class PyGSSpider(CrawlSpider):
    '''
    docstring for PyGSSpider
    '''
    name = 'google.com'
    allowed_domains = ['google.com']
    #start_urls = [
    #    'http://www.google.fr/search?sclient=psy&hl=fr&source=hp&q=sexe&btnG=Rechercher'
    #    ]

    # Follow the "next page" link (id="pnnext") and parse every results page.
    rules = (
        Rule(SgmlLinkExtractor(restrict_xpaths='//a[@id="pnnext"]'),
             callback='parse_item',
             follow=True),
    )

    def parse_item(self, response):
        time.sleep(3)
        hxs = HtmlXPathSelector(response)
        # Each organic result is an <li> under div#ires.
        sites = hxs.select('//div[@id="ires"]/ol/li')
        items = []
        for site in sites:
            item = PyGSItem()
            item['title'] = site.select('h3[@class="r"]/a/text() | '
                                        'h3[@class="r"]/a/em/text()').extract()
            item['desc'] = site.select('div/text()').extract()
            item['link'] = site.select('h3[@class="r"]/a/@href').extract()
            items.append(item)
        return items

@@ -0,0 +1,8 @@
This project is released to the public domain.
It is based on Scrapy, an open source screen-scraping framework.

contact: spike@boxls.com

PyGoogleSearch is a simple Google data crawler. I created a simple command-line
interface to make queries; it exports the results in JSON format to the file
data.json.

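A short sketch of how the exported file could be consumed afterwards, assuming a crawl has already written data.json as described above:

# Sketch: load the JSON feed written by the crawler and print each result.
import json

with open('data.json') as f:
    results = json.load(f)

for result in results:
    print result.get('title'), result.get('link')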
pygs

@@ -0,0 +1,42 @@
#!/usr/bin/env python
import subprocess
from optparse import OptionParser
from urllib import quote
import sys

__VERSION__ = 0.1

SEARCH_URL = '''\
http://www.google.com/search?sclient=psy&hl=en&site=&source=hp&q=foo&btnG=Google+Search\
'''
SCRAPY_PARAMS = ['--set', 'FEED_URI=data.json', '--set', 'FEED_FORMAT=json']

if __name__ == '__main__':
    usage = 'usage: %prog query [OPTION...]'
    op = OptionParser(usage, version='%%prog %s' % __VERSION__)
    opts, args = op.parse_args()
    if len(args) != 1:
        print 'Bad arguments:'
        op.print_usage()
    else:
        scrapy = 'scrapy'
        scrapy_cmd = 'crawl'
        query = args[0]
        # URL-encode the query and substitute it into the search URL template.
        url_query = quote(query)
        search = SEARCH_URL.replace('foo', url_query)
        print url_query
        print 'Launching scrapy with parameters <%s> <%s>' % (scrapy, search)
        p = subprocess.Popen([scrapy, scrapy_cmd, search,
                              '--set', 'FEED_URI=data.json',
                              '--set', 'FEED_FORMAT=json'],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output = p.communicate()[0]
        print output

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html

[settings]
default = PyGoogleSearch.settings

[deploy]
#url = http://localhost:6800/
project = PyGoogleSearch