diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py
index 5f2b68f..e1dadbf 100644
--- a/FourmiCrawler/pipelines.py
+++ b/FourmiCrawler/pipelines.py
@@ -2,10 +2,11 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import re
 
 from scrapy.exceptions import DropItem
 
 
-class FourmiPipeline(object):
+class DuplicatePipeline(object):
 
     def __init__(self):
         self.known_values = set()
@@ -17,9 +18,27 @@ class FourmiPipeline(object):
         :param spider: The spider which scraped the spider
         :return: :raise DropItem: Returns the item if unique or drops them if it's already known
         """
-        value = item['attribute'], item['value']
+        value = (item['attribute'], item['value'], item['conditions'])
         if value in self.known_values:
             raise DropItem("Duplicate item found: %s" % item)  # #[todo] append sources of first item.
         else:
             self.known_values.add(value)
         return item
+
+class AttributeSelectionPipeline(object):
+
+    def __init__(self):
+        pass
+
+    def process_item(self, item, spider):
+        """
+        The items are processed using the selected attribute list available in the spider;
+        items that don't match the selected attributes are dropped.
+        :param item: The incoming item
+        :param spider: The spider which scraped the item. Should have an attribute "selected_attributes".
+        :return: :raise DropItem: Returns the item if it matches a selected attribute, else it is dropped.
+        """
+        if [x for x in spider.selected_attributes if re.match(x, item["attribute"])]:
+            return item
+        else:
+            raise DropItem("Attribute not selected by user: %s" % item)
\ No newline at end of file
diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index be91fef..d7ac212 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -11,7 +11,8 @@ BOT_NAME = 'FourmiCrawler'
 SPIDER_MODULES = ['FourmiCrawler']
 NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
-    'FourmiCrawler.pipelines.FourmiPipeline': 100
+    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100,
+    'FourmiCrawler.pipelines.DuplicatePipeline': 200,
 }
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 9f92a84..87f22c6 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -8,9 +8,10 @@ class FourmiSpider(Spider):
     __parsers = []
     synonyms = []
 
-    def __init__(self, compound=None, *args, **kwargs):
+    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.append(compound)
+        self.selected_attributes = selected_attributes
 
     def parse(self, reponse):
         for parser in self.__parsers:
diff --git a/fourmi.py b/fourmi.py
index e33a833..a9c1d68 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -12,14 +12,15 @@ Usage:
     fourmi --version
 
 Options:
+    --attributes=<regex>        Include only attributes that match these regular expressions split by a comma. [default: .*]
     -h --help                   Show this screen.
     --version                   Show version.
     --verbose                   Verbose logging output.
     --log=<file>                Save log to an file.
     -o --output=<file>          Output file [default: result.*format*]
     -f --format=<format>        Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
-    --include=<regex>           Include only sources that match the regular these expressions split by a comma.
-    --exclude=<regex>           Exclude the sources that match the regular these expressions split by a comma.
+    --include=<regex>           Include only sources that match these regular expressions split by a comma.
+    --exclude=<regex>           Exclude the sources that match these regular expressions split by a comma.
 """
 
 from twisted.internet import reactor
@@ -32,8 +33,8 @@ from FourmiCrawler.spider import FourmiSpider
 from sourceloader import SourceLoader
 
 
-def setup_crawler(searchable, settings, source_loader):
-    spider = FourmiSpider(compound=searchable)
+def setup_crawler(searchable, settings, source_loader, attributes):
+    spider = FourmiSpider(compound=searchable, selected_attributes=attributes)
     spider.add_parsers(source_loader.sources)
     crawler = Crawler(settings)
     crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
@@ -74,7 +75,7 @@ def start_log(docopt_arguments):
 def search(docopt_arguments, source_loader):
     start_log(docopt_arguments)
     settings = scrapy_settings_manipulation(docopt_arguments)
-    setup_crawler(docopt_arguments["<compound>"], settings, source_loader)
+    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
     reactor.run()
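For reference, this is roughly how the new AttributeSelectionPipeline behaves once the comma-split --attributes patterns reach the spider as selected_attributes. A minimal sketch, not part of the diff: the StubSpider class and the plain-dict items are placeholders standing in for FourmiSpider and the scraped items.

    from scrapy.exceptions import DropItem

    from FourmiCrawler.pipelines import AttributeSelectionPipeline


    class StubSpider(object):
        # Only the attribute the pipeline reads; FourmiSpider fills this from --attributes.
        selected_attributes = ["melting.*", "boiling.*"]


    pipeline = AttributeSelectionPipeline()
    spider = StubSpider()

    # Kept: "boiling_point" matches the "boiling.*" pattern.
    kept = pipeline.process_item(
        {"attribute": "boiling_point", "value": "100 C", "conditions": ""}, spider)

    # Dropped: "density" matches none of the selected patterns, so DropItem is raised.
    try:
        pipeline.process_item(
            {"attribute": "density", "value": "1 g/mL", "conditions": ""}, spider)
    except DropItem as error:
        print(error)

With the default --attributes value of .*, every attribute matches and nothing is filtered out.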