diff --git a/Scrapy/pipelines.py b/Scrapy/pipelines.py
index 3345787..3194d7e 100644
--- a/Scrapy/pipelines.py
+++ b/Scrapy/pipelines.py
@@ -2,7 +2,25 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+from scrapy.exceptions import DropItem
+
 
 class FourmiPipeline(object):
+
+    def __init__(self):
+        self.known_values = set()
+
     def process_item(self, item, spider):
-        return item
+        """
+        Process the items so exact duplicates are dropped
+        :param item: The incoming item
+        :param spider: The spider which scraped the item
+        :return: the item, if it is not yet known
+        :raise DropItem: if the item is an exact duplicate of one already seen
+        """
+        value = item['attribute'], item['value']
+        if value in self.known_values:
+            raise DropItem("Duplicate item found: %s" % item)
+        else:
+            self.known_values.add(value)
+            return item
diff --git a/Scrapy/settings.py b/Scrapy/settings.py
index e43aa2b..fd379a9 100644
--- a/Scrapy/settings.py
+++ b/Scrapy/settings.py
@@ -10,6 +10,9 @@ BOT_NAME = 'Fourmi'
 
 SPIDER_MODULES = ['Scrapy.spiders']
 NEWSPIDER_MODULE = 'Scrapy.spiders'
+ITEM_PIPELINES = {
+    'Scrapy.pipelines.FourmiPipeline': 100
+}
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
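
For review context, a minimal sketch of the behaviour these two hunks add. The `attribute` and `value` keys come from the diff itself; the plain dict standing in for a scraped item and the concrete values in it are assumptions for illustration only.

```python
from scrapy.exceptions import DropItem

from Scrapy.pipelines import FourmiPipeline

pipeline = FourmiPipeline()
item = {'attribute': 'melting point', 'value': '90; 92 F'}  # hypothetical scraped result

# First occurrence: the (attribute, value) pair is unknown, so the item passes through.
assert pipeline.process_item(item, spider=None) is item

# Exact duplicate: the pair is now in known_values, so the item is dropped.
try:
    pipeline.process_item(item, spider=None)
except DropItem as exc:
    print(exc)  # Duplicate item found: {'attribute': 'melting point', 'value': '90; 92 F'}
```

Note that keying on the `(attribute, value)` tuple collapses the same property reported by different sources into a single item, and `known_values` grows for the lifetime of the crawl, which is fine for single runs. The `100` in `ITEM_PIPELINES` is the standard Scrapy pipeline order value (0-1000, lower values run first).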