From 2fcec009bb3569da9ee788d01d178cf27b9b891f Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Thu, 8 May 2014 15:20:17 +0200
Subject: [PATCH] Added a Pipeline to deal with attribute selection

AttributeSelectionPipeline drops every item whose attribute matches none
of the regular expressions in the spider's selected_attributes. It is
registered with a lower order value (100) than FourmiPipeline (200).

---
 FourmiCrawler/pipelines.py | 19 +++++++++++++++++++
 FourmiCrawler/settings.py  |  3 ++-
 FourmiCrawler/spider.py    |  6 +++++-
 3 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py
index 5f2b68f..cbf50d3 100644
--- a/FourmiCrawler/pipelines.py
+++ b/FourmiCrawler/pipelines.py
@@ -2,6 +2,7 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import re
 from scrapy.exceptions import DropItem
 
 
@@ -23,3 +24,21 @@ class FourmiPipeline(object):
         else:
             self.known_values.add(value)
             return item
+
+
+class AttributeSelectionPipeline(object):
+    """Drop every item whose attribute the user did not select."""
+
+    def process_item(self, item, spider):
+        """Return `item` if its attribute matches a selected pattern.
+
+        Each entry of `spider.selected_attributes` is used as a regular
+        expression and tried with `re.match`, i.e. anchored at the start
+        of `item["attribute"]`.
+
+        Raises DropItem when no pattern matches.
+        """
+        if any(re.match(pattern, item["attribute"])
+               for pattern in spider.selected_attributes):
+            return item
+        raise DropItem("Attribute not selected by user: %s" % item)
diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index be91fef..a28cf9a 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -11,7 +11,8 @@ BOT_NAME = 'FourmiCrawler'
 SPIDER_MODULES = ['FourmiCrawler']
 NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
-    'FourmiCrawler.pipelines.FourmiPipeline': 100
+    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100,
+    'FourmiCrawler.pipelines.FourmiPipeline': 200,
 }
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 9f92a84..87f22c6 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -8,9 +8,13 @@ class FourmiSpider(Spider):
     __parsers = []
     synonyms = []
 
-    def __init__(self, compound=None, *args, **kwargs):
+    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.append(compound)
+        if selected_attributes is None:
+            # A list default in the signature would be shared by all spiders.
+            selected_attributes = [".*"]
+        self.selected_attributes = selected_attributes
 
     def parse(self, reponse):
         for parser in self.__parsers: