Archived
1
0

Fixed Duplicate Pipeline + rename

This commit is contained in:
Jip J. Dekker 2014-05-08 15:45:42 +02:00
parent 2e654255c5
commit f193aac24a
2 changed files with 3 additions and 3 deletions

View File

@ -6,7 +6,7 @@ import re
from scrapy.exceptions import DropItem from scrapy.exceptions import DropItem
class FourmiPipeline(object): class DuplicatePipeline(object):
def __init__(self): def __init__(self):
self.known_values = set() self.known_values = set()
@ -18,7 +18,7 @@ class FourmiPipeline(object):
:param spider: The spider which scraped the item :param spider: The spider which scraped the item
:return: :raise DropItem: Returns the item if unique or drops them if it's already known :return: :raise DropItem: Returns the item if unique or drops them if it's already known
""" """
value = item['attribute'], item['value'] value = (item['attribute'], item['value'], item['conditions'])
if value in self.known_values: if value in self.known_values:
raise DropItem("Duplicate item found: %s" % item) # #[todo] append sources of first item. raise DropItem("Duplicate item found: %s" % item) # #[todo] append sources of first item.
else: else:

View File

@ -12,7 +12,7 @@ SPIDER_MODULES = ['FourmiCrawler']
NEWSPIDER_MODULE = 'FourmiCrawler' NEWSPIDER_MODULE = 'FourmiCrawler'
ITEM_PIPELINES = { ITEM_PIPELINES = {
'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100, 'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100,
'FourmiCrawler.pipelines.FourmiPipeline': 200, 'FourmiCrawler.pipelines.DuplicatePipeline': 200,
} }
FEED_URI = 'results.json' FEED_URI = 'results.json'
FEED_FORMAT = 'jsonlines' FEED_FORMAT = 'jsonlines'