
Added a formal pipeline to make sure that we don't supply duplicate values.

This commit is contained in:
Jip J. Dekker 2014-03-17 16:25:48 +01:00
parent 8dd2c168d2
commit 55843d320c
2 changed files with 21 additions and 1 deletion

Scrapy/pipelines.py

@@ -2,7 +2,24 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem
class FourmiPipeline(object):

    def __init__(self):
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Process the items so that exact duplicates are dropped
        :param item: The incoming item
        :param spider: The spider which scraped the item
        :return: The item, if it was not seen before
        :raise DropItem: If an exact duplicate of the item is already known
        """
        value = item['attribute'], item['value']
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.known_values.add(value)
            return item
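
A minimal sketch (not part of the commit) of how the pipeline behaves when it sees the same item twice; the item dict and its field values are hypothetical, and None stands in for the unused spider argument:

from scrapy.exceptions import DropItem
from Scrapy.pipelines import FourmiPipeline  # the class added above

pipeline = FourmiPipeline()
item = {'attribute': 'melting point', 'value': '0 C'}  # hypothetical item

pipeline.process_item(item, None)             # unique: the item is returned
try:
    pipeline.process_item(dict(item), None)   # exact duplicate of the first
except DropItem as dropped:
    print(dropped)                            # "Duplicate item found: ..."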

Scrapy/settings.py

@@ -10,6 +10,9 @@ BOT_NAME = 'Fourmi'
SPIDER_MODULES = ['Scrapy.spiders']
NEWSPIDER_MODULE = 'Scrapy.spiders'
ITEM_PIPELINES = {
'Scrapy.pipelines.FourmiPipeline': 100
}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
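
The value 100 is the pipeline's priority in Scrapy's ITEM_PIPELINES setting, an integer conventionally in the 0-1000 range; items pass through the registered pipelines in ascending order. A sketch of how a second, hypothetical pipeline would be sequenced after the duplicate filter:

ITEM_PIPELINES = {
    'Scrapy.pipelines.FourmiPipeline': 100,   # runs first: filters duplicates
    'Scrapy.pipelines.ExportPipeline': 800,   # hypothetical later stage
}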