Archived
1
0
This repository has been archived on 2025-03-03. You can view files and clone it, but cannot push or open issues or pull requests.
Fourmi/FourmiCrawler/pipelines.py

26 lines
846 B
Python

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem
class FourmiPipeline(object):
def __init__(self):
self.known_values = set()
def process_item(self, item, spider):
"""
Processing the items so exact doubles are dropped
:param item: The incoming item
:param spider: The spider which scraped the spider
:return: :raise DropItem: Returns the item if unique or drops them if it's already known
"""
value = item['attribute'], item['value']
if value in self.known_values:
raise DropItem("Duplicate item found: %s" % item)
else:
self.known_values.add(value)
return item