Added a formal pipeline to make sure that we don't supply duplicate values.
parent 8dd2c168d2
commit 55843d320c
@@ -2,7 +2,24 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+from scrapy.exceptions import DropItem
 
 
 class FourmiPipeline(object):
+
+    def __init__(self):
+        self.known_values = set()
+
     def process_item(self, item, spider):
+        """
+        Processing the items so that exact doubles are dropped
+        :param item: The incoming item
+        :param spider: The spider which scraped the item
+        :return: :raise DropItem: Returns the item if unique or drops it if it is already known
+        """
+        value = item['attribute'], item['value']
+        if value in self.known_values:
+            raise DropItem("Duplicate item found: %s" % item)
+        else:
+            self.known_values.add(value)
         return item
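The pipeline above keeps a set of (attribute, value) pairs it has already seen and raises DropItem for any repeat. As a rough illustration of that behaviour, here is a hedged sketch that feeds a few items through the class by hand; the import path, the example attribute names, and the None spider argument are assumptions, not part of the commit:

from scrapy.exceptions import DropItem
from Scrapy.pipelines import FourmiPipeline  # module path assumed from the ITEM_PIPELINES setting below

pipeline = FourmiPipeline()
items = [
    {'attribute': 'melting_point', 'value': '0 C'},
    {'attribute': 'melting_point', 'value': '0 C'},    # exact double of the first item
    {'attribute': 'boiling_point', 'value': '100 C'},
]

for item in items:
    try:
        # process_item never touches the spider argument, so a placeholder is enough here
        print('kept:', pipeline.process_item(item, spider=None))
    except DropItem as exc:
        print('dropped:', exc)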
@@ -10,6 +10,9 @@ BOT_NAME = 'Fourmi'
 
 SPIDER_MODULES = ['Scrapy.spiders']
 NEWSPIDER_MODULE = 'Scrapy.spiders'
+ITEM_PIPELINES = {
+    'Scrapy.pipelines.FourmiPipeline': 100
+}
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
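The value 100 in ITEM_PIPELINES is the pipeline's order: Scrapy runs enabled pipelines in ascending order of this number (conventionally 0 to 1000). As a sketch of how a second, later-stage pipeline could be slotted in after the deduplication step (the SomeExportPipeline name is hypothetical):

ITEM_PIPELINES = {
    'Scrapy.pipelines.FourmiPipeline': 100,      # drop exact duplicates first
    'Scrapy.pipelines.SomeExportPipeline': 300,  # hypothetical pipeline that runs afterwards
}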