diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py index 34217ac..e1dadbf 100644 --- a/FourmiCrawler/pipelines.py +++ b/FourmiCrawler/pipelines.py @@ -6,7 +6,7 @@ import re from scrapy.exceptions import DropItem -class FourmiPipeline(object): +class DuplicatePipeline(object): def __init__(self): self.known_values = set() @@ -18,7 +18,7 @@ class FourmiPipeline(object): :param spider: The spider which scraped the spider :return: :raise DropItem: Returns the item if unique or drops them if it's already known """ - value = item['attribute'], item['value'] + value = (item['attribute'], item['value'], item['conditions']) if value in self.known_values: raise DropItem("Duplicate item found: %s" % item) # #[todo] append sources of first item. else: diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py index a28cf9a..d7ac212 100644 --- a/FourmiCrawler/settings.py +++ b/FourmiCrawler/settings.py @@ -12,7 +12,7 @@ SPIDER_MODULES = ['FourmiCrawler'] NEWSPIDER_MODULE = 'FourmiCrawler' ITEM_PIPELINES = { 'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100, - 'FourmiCrawler.pipelines.FourmiPipeline': 200, + 'FourmiCrawler.pipelines.DuplicatePipeline': 200, } FEED_URI = 'results.json' FEED_FORMAT = 'jsonlines'