From c7051331946ddd1e2beceb6f67c0b3160ba24a39 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Thu, 8 May 2014 15:29:47 +0200 Subject: [PATCH] Added CLI functionality to deal with attribute selection --- fourmi.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fourmi.py b/fourmi.py index e33a833..c8afab5 100755 --- a/fourmi.py +++ b/fourmi.py @@ -12,15 +12,17 @@ Usage: fourmi --version Options: + --attributes= Include only attributes that match these regular expressions split by a comma. [default: .*] -h --help Show this screen. --version Show version. --verbose Verbose logging output. --log= Save log to an file. -o --output= Output file [default: result.*format*] -f --format= Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines] - --include= Include only sources that match the regular these expressions split by a comma. - --exclude= Exclude the sources that match the regular these expressions split by a comma. + --include= Include only sources that match these regular expressions split by a comma. + --exclude= Exclude the sources that match these regular expressions split by a comma. 
""" +import re from twisted.internet import reactor from scrapy.crawler import Crawler @@ -32,8 +34,8 @@ from FourmiCrawler.spider import FourmiSpider from sourceloader import SourceLoader -def setup_crawler(searchable, settings, source_loader): - spider = FourmiSpider(compound=searchable) +def setup_crawler(searchable, settings, source_loader, attributes): + spider = FourmiSpider(compound=searchable, selected_attributes=attributes) spider.add_parsers(source_loader.sources) crawler = Crawler(settings) crawler.signals.connect(reactor.stop, signal=signals.spider_closed) @@ -74,7 +76,7 @@ def start_log(docopt_arguments): def search(docopt_arguments, source_loader): start_log(docopt_arguments) settings = scrapy_settings_manipulation(docopt_arguments) - setup_crawler(docopt_arguments[""], settings, source_loader) + setup_crawler(docopt_arguments[""], settings, source_loader, docopt_arguments["--attributes"].split(',')) reactor.run()