Fixed Duplicate Pipeline + rename
This commit is contained in:
parent
2e654255c5
commit
f193aac24a
@ -6,7 +6,7 @@ import re
|
|||||||
from scrapy.exceptions import DropItem
|
from scrapy.exceptions import DropItem
|
||||||
|
|
||||||
|
|
||||||
class FourmiPipeline(object):
|
class DuplicatePipeline(object):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.known_values = set()
|
self.known_values = set()
|
||||||
@ -18,7 +18,7 @@ class FourmiPipeline(object):
|
|||||||
:param spider: The spider which scraped the item
|
:param spider: The spider which scraped the item
|
||||||
:return: :raise DropItem: Returns the item if unique or drops it if it's already known
|
:return: :raise DropItem: Returns the item if unique or drops it if it's already known
|
||||||
"""
|
"""
|
||||||
value = item['attribute'], item['value']
|
value = (item['attribute'], item['value'], item['conditions'])
|
||||||
if value in self.known_values:
|
if value in self.known_values:
|
||||||
raise DropItem("Duplicate item found: %s" % item) # #[todo] append sources of first item.
|
raise DropItem("Duplicate item found: %s" % item) # #[todo] append sources of first item.
|
||||||
else:
|
else:
|
||||||
|
@ -12,7 +12,7 @@ SPIDER_MODULES = ['FourmiCrawler']
|
|||||||
NEWSPIDER_MODULE = 'FourmiCrawler'
|
NEWSPIDER_MODULE = 'FourmiCrawler'
|
||||||
ITEM_PIPELINES = {
|
ITEM_PIPELINES = {
|
||||||
'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100,
|
'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100,
|
||||||
'FourmiCrawler.pipelines.FourmiPipeline': 200,
|
'FourmiCrawler.pipelines.DuplicatePipeline': 200,
|
||||||
}
|
}
|
||||||
FEED_URI = 'results.json'
|
FEED_URI = 'results.json'
|
||||||
FEED_FORMAT = 'jsonlines'
|
FEED_FORMAT = 'jsonlines'
|
||||||
|
Reference in New Issue
Block a user