From f193aac24a6cad32534998af5afa3bfee0eded6f Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Thu, 8 May 2014 15:45:42 +0200 Subject: [PATCH] Fixed Duplicate Pipeline + rename --- FourmiCrawler/pipelines.py | 4 ++-- FourmiCrawler/settings.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py index 34217ac..e1dadbf 100644 --- a/FourmiCrawler/pipelines.py +++ b/FourmiCrawler/pipelines.py @@ -6,7 +6,7 @@ import re from scrapy.exceptions import DropItem -class FourmiPipeline(object): +class DuplicatePipeline(object): def __init__(self): self.known_values = set() @@ -18,7 +18,7 @@ class FourmiPipeline(object): :param spider: The spider which scraped the spider :return: :raise DropItem: Returns the item if unique or drops them if it's already known """ - value = item['attribute'], item['value'] + value = (item['attribute'], item['value'], item['conditions']) if value in self.known_values: raise DropItem("Duplicate item found: %s" % item) # #[todo] append sources of first item. else: diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py index a28cf9a..d7ac212 100644 --- a/FourmiCrawler/settings.py +++ b/FourmiCrawler/settings.py @@ -12,7 +12,7 @@ SPIDER_MODULES = ['FourmiCrawler'] NEWSPIDER_MODULE = 'FourmiCrawler' ITEM_PIPELINES = { 'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100, - 'FourmiCrawler.pipelines.FourmiPipeline': 200, + 'FourmiCrawler.pipelines.DuplicatePipeline': 200, } FEED_URI = 'results.json' FEED_FORMAT = 'jsonlines'