diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py index cbf50d3..34217ac 100644 --- a/FourmiCrawler/pipelines.py +++ b/FourmiCrawler/pipelines.py @@ -31,6 +31,13 @@ class AttributeSelectionPipeline(object): pass; def process_item(self, item, spider): + """ + The items are processed using the selected attribute list available in the spider, + items that don't match the selected attributes are dropped. + :param item: The incoming item + :param spider: The spider which scraped the item. Should have an attribute "selected_attributes". + :return: :raise DropItem: Returns item if it matches a selected attribute, else it is dropped. + """ if [x for x in spider.selected_attributes if re.match(x, item["attribute"])]: return item else: diff --git a/fourmi.py b/fourmi.py index c8afab5..a9c1d68 100755 --- a/fourmi.py +++ b/fourmi.py @@ -22,7 +22,6 @@ Options: --include= Include only sources that match these regular expressions split by a comma. --exclude= Exclude the sources that match these regular expressions split by a comma. """ -import re from twisted.internet import reactor from scrapy.crawler import Crawler