Archived
1
0

Added a pipeline to deal with attribute selection

This commit is contained in:
Jip J. Dekker 2014-05-08 15:20:17 +02:00
parent 81886981a3
commit 2fcec009bb
3 changed files with 16 additions and 2 deletions

View File

@ -2,6 +2,7 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re
from scrapy.exceptions import DropItem
@ -23,3 +24,14 @@ class FourmiPipeline(object):
else:
self.known_values.add(value)
return item
class AttributeSelectionPipeline(object):
    """Item pipeline that only lets through items with a selected attribute.

    An item is kept when at least one regular expression in the spider's
    ``selected_attributes`` matches the beginning of ``item["attribute"]``
    (``re.match`` semantics); every other item is dropped.
    """

    def process_item(self, item, spider):
        """Return ``item`` when its attribute is selected, otherwise drop it.

        :param item: scraped item; its ``"attribute"`` entry is tested.
        :param spider: object exposing ``selected_attributes``, an iterable
            of regular-expression patterns.
        :return: the unchanged ``item`` when some pattern matches.
        :raises DropItem: when no pattern matches the item's attribute.
        """
        # any() stops at the first matching pattern instead of collecting
        # every match into a list that is only used for its truthiness.
        if any(re.match(pattern, item["attribute"])
               for pattern in spider.selected_attributes):
            return item
        raise DropItem("Attribute not selected by user: %s" % item)

View File

@ -11,7 +11,8 @@ BOT_NAME = 'FourmiCrawler'
SPIDER_MODULES = ['FourmiCrawler']
NEWSPIDER_MODULE = 'FourmiCrawler'
ITEM_PIPELINES = {
'FourmiCrawler.pipelines.FourmiPipeline': 100
'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100,
'FourmiCrawler.pipelines.FourmiPipeline': 200,
}
FEED_URI = 'results.json'
FEED_FORMAT = 'jsonlines'

View File

@ -8,9 +8,10 @@ class FourmiSpider(Spider):
__parsers = []
synonyms = []
def __init__(self, compound=None, *args, **kwargs):
def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
    """Initialise the spider with a compound and its attribute filters.

    :param compound: value appended to ``synonyms``.
    :param selected_attributes: list of attribute patterns stored on the
        spider as ``selected_attributes``; when omitted it defaults to
        ``[".*"]``.
    :param args: extra positional arguments forwarded to the base class.
    :param kwargs: extra keyword arguments forwarded to the base class.
    """
    super(FourmiSpider, self).__init__(*args, **kwargs)
    # NOTE(review): synonyms is declared in the class body, so this append
    # appears to be shared by all instances -- confirm that is intended.
    self.synonyms.append(compound)
    # Create the default list per call; a list literal in the signature
    # would be one object shared by every spider instance.
    if selected_attributes is None:
        selected_attributes = [".*"]
    self.selected_attributes = selected_attributes
def parse(self, reponse):
for parser in self.__parsers: