From 2e654255c59ff238cae2a374eec27b5c2e6f98bf Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Thu, 8 May 2014 15:35:18 +0200 Subject: [PATCH] Added documentation to the pipeline. --- FourmiCrawler/pipelines.py | 7 +++++++ fourmi.py | 1 - 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py index cbf50d3..34217ac 100644 --- a/FourmiCrawler/pipelines.py +++ b/FourmiCrawler/pipelines.py @@ -31,6 +31,13 @@ class AttributeSelectionPipeline(object): pass; def process_item(self, item, spider): + """ + The items are processed using the selected attribute list available in the spider, + items that don't match the selected items are dropped. + :param item: The incoming item + :param spider: The spider which scraped the item. Should have an attribute "selected_attributes". + :return: :raise DropItem: Returns item if it matches a selected attribute, else it is dropped. + """ if [x for x in spider.selected_attributes if re.match(x, item["attribute"])]: return item else: diff --git a/fourmi.py b/fourmi.py index c8afab5..a9c1d68 100755 --- a/fourmi.py +++ b/fourmi.py @@ -22,7 +22,6 @@ Options: --include= Include only sources that match these regular expressions split by a comma. --exclude= Exclude the sources that match these regular expressions split by a comma. """ -import re from twisted.internet import reactor from scrapy.crawler import Crawler