
Added a formal pipeline to make sure that we don't supply duplicate values.

This commit is contained in:
Jip J. Dekker 2014-03-17 16:25:48 +01:00
parent 8dd2c168d2
commit 55843d320c
2 changed files with 21 additions and 1 deletion

Scrapy/pipelines.py

@@ -2,7 +2,24 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem
class FourmiPipeline(object):

    def __init__(self):
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Process the items so that exact duplicates are dropped
        :param item: The incoming item
        :param spider: The spider which scraped the item
        :return: The item, if it was not seen before
        :raise DropItem: If an exact duplicate of the item is already known
        """
        value = item['attribute'], item['value']
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.known_values.add(value)
            return item
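
A minimal sketch (not part of the commit) of how the pipeline behaves when it sees the same item twice; the item dict and its field values are hypothetical, and None stands in for the unused spider argument:

from scrapy.exceptions import DropItem
from Scrapy.pipelines import FourmiPipeline  # the class added above

pipeline = FourmiPipeline()
item = {'attribute': 'melting point', 'value': '0 C'}  # hypothetical item

pipeline.process_item(item, None)             # unique: the item is returned
try:
    pipeline.process_item(dict(item), None)   # exact duplicate of the first
except DropItem as dropped:
    print(dropped)                            # "Duplicate item found: ..."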

Scrapy/settings.py

@@ -10,6 +10,9 @@ BOT_NAME = 'Fourmi'
SPIDER_MODULES = ['Scrapy.spiders']
NEWSPIDER_MODULE = 'Scrapy.spiders'
ITEM_PIPELINES = {
'Scrapy.pipelines.FourmiPipeline': 100
}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
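
The value 100 is the pipeline's priority in Scrapy's ITEM_PIPELINES setting, an integer conventionally in the 0-1000 range; items pass through the registered pipelines in ascending order. A sketch of how a second, hypothetical pipeline would be sequenced after the duplicate filter:

ITEM_PIPELINES = {
    'Scrapy.pipelines.FourmiPipeline': 100,   # runs first: filters duplicates
    'Scrapy.pipelines.ExportPipeline': 800,   # hypothetical later stage
}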