Archived
1
0
This repository has been archived on 2025-03-03. You can view files and clone it, but cannot push or open issues or pull requests.
Fourmi/FourmiCrawler/pipelines.py
2014-05-08 15:45:42 +02:00

44 lines
1.6 KiB
Python

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re
from scrapy.exceptions import DropItem
class DuplicatePipeline(object):
    """Item pipeline that lets each (attribute, value, conditions) triple pass only once."""

    def __init__(self):
        # Identity triples of every item this pipeline instance has let through.
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Drop items that exactly duplicate an item seen earlier.

        :param item: The incoming item; its 'attribute', 'value' and 'conditions' fields form its identity
        :param spider: The spider which scraped the item (not used here)
        :return: The item itself when its identity has not been seen before
        :raise DropItem: When an item with the same identity was already processed
        """
        fingerprint = (item['attribute'], item['value'], item['conditions'])
        if fingerprint not in self.known_values:
            self.known_values.add(fingerprint)
            return item
        # TODO: append the sources of this duplicate to the first item.
        raise DropItem("Duplicate item found: %s" % item)
class AttributeSelectionPipeline(object):
    """Item pipeline that keeps only items whose attribute was selected on the spider."""

    def process_item(self, item, spider):
        """
        Keep the item only if its attribute matches one of the spider's selected patterns.

        Each entry of ``spider.selected_attributes`` is used as a regular expression and
        applied with ``re.match``, so a pattern has to match from the start of the
        attribute name (it does not have to consume the whole name).

        :param item: The incoming item; its "attribute" field is tested
        :param spider: The spider which scraped the item. Should have an attribute "selected_attributes".
        :return: The item itself when at least one pattern matches
        :raise DropItem: When no selected pattern matches the item's attribute
        """
        attribute = item["attribute"]
        # any() stops at the first matching pattern instead of building a full list of matches.
        if any(re.match(pattern, attribute) for pattern in spider.selected_attributes):
            return item
        raise DropItem("Attribute not selected by user: %s" % item)