From 61ca2520e35ca63e5fb79f2e2205af5bed883701 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 15 Apr 2014 19:40:54 +0200 Subject: [PATCH] Added feed export functionality --- FourmiCrawler/settings.py | 3 +++ fourmi.py | 27 ++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py index b025167..be91fef 100644 --- a/FourmiCrawler/settings.py +++ b/FourmiCrawler/settings.py @@ -13,6 +13,9 @@ NEWSPIDER_MODULE = 'FourmiCrawler' ITEM_PIPELINES = { 'FourmiCrawler.pipelines.FourmiPipeline': 100 } +FEED_URI = 'results.json' +FEED_FORMAT = 'jsonlines' + # Crawl responsibly by identifying yourself (and your website) on the # user-agent diff --git a/fourmi.py b/fourmi.py index 005a4f1..5999f8f 100755 --- a/fourmi.py +++ b/fourmi.py @@ -14,7 +14,7 @@ Options: --verbose Verbose logging output. --log=<file> Save log to an file. -o --output=<file> Output file [default: result.*format*] - -f --format=<format> Output formats [default: jsonlines | supported: csv, json, jsonlines, xml] + -f --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines] """ import os @@ -41,15 +41,14 @@ def load_parsers(rel_dir="FourmiCrawler/parsers"): classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] for cls in classes: if issubclass(cls, Parser) and cls not in known_parser: - parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? + parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? 
known_parser.add(cls) return parsers -def setup_crawler(searchable): +def setup_crawler(searchable, settings): spider = FourmiSpider(compound=searchable) spider.add_parsers(load_parsers()) - settings = get_project_settings() crawler = Crawler(settings) crawler.signals.connect(reactor.stop, signal=signals.spider_closed) crawler.configure() @@ -57,6 +56,22 @@ def setup_crawler(searchable): crawler.start() +def scrapy_settings_manipulation(arguments): + settings = get_project_settings() + + if arguments["--output"] != 'result.*format*': + settings.overrides["FEED_URI"] = arguments["--output"] + elif arguments["--format"] == "jsonlines": + settings.overrides["FEED_URI"] = "results.json" + elif arguments["--format"] is not None: + settings.overrides["FEED_URI"] = "results." + arguments["--format"] + + if arguments["--format"] is not None: + settings.overrides["FEED_FORMAT"] = arguments["--format"] + + return settings + + def start_log(arguments): if arguments["--log"] is not None: if arguments["--verbose"]: @@ -73,6 +88,8 @@ if __name__ == '__main__': arguments = docopt.docopt(__doc__, version='Fourmi - V0.0.1a') start_log(arguments) - setup_crawler([arguments["<compound>"]]) + print arguments + settings = scrapy_settings_manipulation(arguments) + setup_crawler([arguments["<compound>"]], settings) reactor.run()