Added a formal pipeline to make sure that we don't supply duplicate values.
parent 8dd2c168d2
commit 55843d320c
Scrapy/pipelines.py:

@@ -2,7 +2,24 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+from scrapy.exceptions import DropItem
+
 
 class FourmiPipeline(object):
+
+    def __init__(self):
+        self.known_values = set()
+
     def process_item(self, item, spider):
-        return item
+        """
+        Processing the items so exact duplicates are dropped
+        :param item: The incoming item
+        :param spider: The spider which scraped the item
+        :return: :raise DropItem: Returns the item if unique or drops it if it's already known
+        """
+        value = item['attribute'], item['value']
+        if value in self.known_values:
+            raise DropItem("Duplicate item found: %s" % item)
+        else:
+            self.known_values.add(value)
+            return item
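For context, the new pipeline keys each item by its (attribute, value) pair and raises DropItem on an exact repeat. A minimal sketch of the intended behaviour, assuming items are plain dicts carrying 'attribute' and 'value' keys (the sample values are hypothetical, not from the commit):

from scrapy.exceptions import DropItem

from Scrapy.pipelines import FourmiPipeline  # import path assumed from the settings hunk below

pipeline = FourmiPipeline()
first = {'attribute': 'boiling point', 'value': '100 degrees C'}  # hypothetical item
double = dict(first)  # an exact copy of the first item

# A unique (attribute, value) pair passes straight through.
assert pipeline.process_item(first, spider=None) is first

# Feeding the exact same pair again raises DropItem.
try:
    pipeline.process_item(double, spider=None)
except DropItem as caught:
    print(caught)  # Duplicate item found: {...}

Note that known_values lives in memory for the lifetime of the pipeline, so the filter only catches duplicates within a single crawl.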
Scrapy/settings.py:

@@ -10,6 +10,9 @@ BOT_NAME = 'Fourmi'
 
 SPIDER_MODULES = ['Scrapy.spiders']
 NEWSPIDER_MODULE = 'Scrapy.spiders'
+ITEM_PIPELINES = {
+    'Scrapy.pipelines.FourmiPipeline': 100
+}
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
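The value 100 is the pipeline's order: Scrapy runs every enabled pipeline in ascending order of this integer, conventionally chosen from the 0-1000 range. A sketch of how a later processing stage would slot in behind the duplicate filter (ExportPipeline is hypothetical, not part of this commit):

ITEM_PIPELINES = {
    'Scrapy.pipelines.FourmiPipeline': 100,  # drop exact duplicates first
    'Scrapy.pipelines.ExportPipeline': 800,  # hypothetical downstream stage, runs after the filter
}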