Skip to content

Commit

Permalink
add DirbotItem to items.py so that genspider creates a valid spider
Browse files (browse the repository at this point in the history)
  • Loading branch information
stav committed Jun 7, 2012
1 parent 917ef94 commit 2212357
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 21 deletions.
11 changes: 8 additions & 3 deletions dirbot/items.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
from scrapy.item import Item, Field

class Website(Item):

class DirbotItem(Item):

name = Field()
url = Field()
description = Field()


class Website(DirbotItem):

url = Field()

def __str__(self):
return "Website: name=%s url=%s" % (self['name'], self['url'])
return "Website: name=%s url=%s" % (self.get('name'), self.get('url'))
1 change: 1 addition & 0 deletions dirbot/pipelines.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from scrapy.exceptions import DropItem


class FilterWordsPipeline(object):
"""A pipeline for filtering out items which contain certain words in their
description"""
Expand Down
1 change: 0 additions & 1 deletion dirbot/settings.py
Original file line number Diff line number Diff line change
Expand Up
@@ -9,4 +9,3 @@
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)

ITEM_PIPELINES = ['dirbot.pipelines.FilterWordsPipeline']

37 changes: 20 additions & 17 deletions dirbot/spiders/dmoz.py
Original file line number Diff line number Diff line change
Expand Up
@@ -3,22 +3,25 @@

from dirbot.items import Website


class DmozSpider(BaseSpider):
name = "dmoz"
allowed_domains = ["dmoz.org"]
start_urls = [
"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
]
name = "dmoz"
allowed_domains = ["dmoz.org"]
start_urls = [
"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
]

def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//ul/li')
items = []

for site in sites:
item = Website()
item['name'] = site.select('a/text()').extract()
item['url'] = site.select('a/@href').extract()
item['description'] = site.select('text()').extract()
items.append(item)

def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//ul/li')
items = []
for site in sites:
item = Website()
item['name'] = site.select('a/text()').extract()
item['url'] = site.select('a/@href').extract()
item['description'] = site.select('text()').extract()
items.append(item)
return items
return items

0 comments on commit 2212357

Please sign in to comment.