Fourmi/FourmiCrawler/pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re
from scrapy.exceptions import DropItem


class DuplicatePipeline(object):
    """Item pipeline that lets each (attribute, value, conditions) combination through once."""

    def __init__(self):
        # Keys of every item that has already been passed on.
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Drop items that are exact doubles of an item seen earlier
        :param item: The incoming item; its 'attribute', 'value' and 'conditions' fields form the key
        :param spider: The spider which scraped the item (not used here)
        :return: The unchanged item when its key has not been seen before
        :raise DropItem: When an item with the same key was already processed
        """
        fingerprint = (item['attribute'], item['value'], item['conditions'])
        if fingerprint in self.known_values:
            # TODO: append the sources of this item to the first item instead of discarding them.
            raise DropItem("Duplicate item found: %s" % item)
        self.known_values.add(fingerprint)
        return item

class AttributeSelectionPipeline(object):
    """Item pipeline that keeps only items whose attribute matches a selected pattern."""

    def process_item(self, item, spider):
        """
        Pass an item on only if its attribute matches one of the spider's selected attributes.
        Every entry of spider.selected_attributes is used as a regular expression and applied
        with re.match, so a pattern has to match from the start of item["attribute"].
        :param item: The incoming item; its "attribute" field is tested.
        :param spider: The spider which scraped the item. Should have an attribute "selected_attributes".
        :return: The unchanged item if at least one pattern matches.
        :raise DropItem: If none of the selected patterns matches the item's attribute.
        """
        attribute = item["attribute"]
        # any() stops at the first matching pattern instead of collecting every match in a list.
        if any(re.match(pattern, attribute) for pattern in spider.selected_attributes):
            return item
        raise DropItem("Attribute not selected by user: %s" % item)