From ee01e697d399b04fcc58290cf0eb69841bbbf20b Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Mon, 14 Apr 2014 20:21:41 +0200
Subject: [PATCH 1/8] Added Docopt as an CLI framework

---
 Fourmi.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Fourmi.py b/Fourmi.py
index 7c3cf7d..a13cfcb 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -11,7 +11,9 @@ from scrapy import log, signals
 from FourmiCrawler.parsers.parser import Parser
 from FourmiCrawler.spider import FourmiSpider
 from scrapy.utils.project import get_project_settings
-import os, inspect, re
+import os, inspect
+import docopt
+

 def load_parsers(rel_dir="FourmiCrawler/parsers"):
     path = os.path.dirname(os.path.abspath(__file__))

From 2ad33080c640e590cc0c5296c1025f5adf73fd83 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Mon, 14 Apr 2014 20:45:07 +0200
Subject: [PATCH 2/8] First setup of the CLI, decided on a structure

---
 Fourmi.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/Fourmi.py b/Fourmi.py
index a13cfcb..b2ebcd0 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -1,8 +1,18 @@
 #!/usr/bin/env python
 """
-Fourmi - An internet webcrawler searching for information on chemical
-compounds. [todo] - Add some more useful text here.
-Version: v0.0.1 - Empty Application that could do something but all logic of websites isn't there yet!
+Fourmi, an webscraper build to search specific information for a given compound.
+
+Usage:
+    fourmi search
+    fourmi [options] search
+    fourmi -h | --help
+    fourmi --version
+
+Options:
+    -h --help      Show this screen.
+    --version      Show version.
+    --verbose      Verbose logging output.
+    --log=         Save log to an file.
 """

 from twisted.internet import reactor

From a4dd6e1835c64bbd15e86e149d53790eb6608e73 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Mon, 14 Apr 2014 21:31:20 +0200
Subject: [PATCH 3/8] Made logging work

---
 Fourmi.py | 36 ++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/Fourmi.py b/Fourmi.py
index b2ebcd0..f0218f6 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -3,8 +3,8 @@
 Fourmi, an webscraper build to search specific information for a given compound.

 Usage:
-    fourmi search
-    fourmi [options] search
+    fourmi search ...
+    fourmi [options] search ...
     fourmi -h | --help
     fourmi --version

@@ -15,14 +15,17 @@ Options:
     --log=         Save log to an file.
""" +import os +import inspect + from twisted.internet import reactor from scrapy.crawler import Crawler from scrapy import log, signals +from scrapy.utils.project import get_project_settings +import docopt + from FourmiCrawler.parsers.parser import Parser from FourmiCrawler.spider import FourmiSpider -from scrapy.utils.project import get_project_settings -import os, inspect -import docopt def load_parsers(rel_dir="FourmiCrawler/parsers"): @@ -40,6 +43,7 @@ def load_parsers(rel_dir="FourmiCrawler/parsers"): known_parser.add(cls) return parsers + def setup_crawler(searchables): spider = FourmiSpider(compounds=searchables) spider.add_parsers(load_parsers()) @@ -51,10 +55,22 @@ def setup_crawler(searchables): crawler.start() -def start(): - setup_crawler(["Methane"]) - log.start() +def start_log(arguments): + if arguments["--log"] is not None: + if arguments["--verbose"]: + log.start(logfile=arguments["--log"], logstdout=False, loglevel=log.DEBUG) + else: + log.start(logfile=arguments["--log"], logstdout=True, loglevel=log.WARNING) + else: + if arguments["--verbose"]: + log.start(logstdout=False, loglevel=log.DEBUG) + else: + log.start(logstdout=True, loglevel=log.WARNING) + + +if __name__ == '__main__': + arguments = docopt.docopt(__doc__, version='Fourmi - V0.0.1a') + start_log(arguments) + setup_crawler([arguments[""]]) reactor.run() - -start() From ffb386103489bad55e22ec7a4d0c511d4f5d54a7 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 15 Apr 2014 18:49:30 +0200 Subject: [PATCH 4/8] Search for single compound, filename should be lowercase --- FourmiCrawler/spider.py | 7 ++----- Fourmi.py => fourmi.py | 8 ++++---- 2 files changed, 6 insertions(+), 9 deletions(-) rename Fourmi.py => fourmi.py (93%) diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 327de42..77b2c11 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -8,12 +8,9 @@ class FourmiSpider(Spider): __parsers = [] synonyms = [] - def __init__(self, compounds=None, *args, **kwargs): + def __init__(self, compound=None, *args, **kwargs): super(FourmiSpider, self).__init__(*args, **kwargs) - if isinstance(compounds, list): - self.synonyms.extend(compounds) - else: - self.synonyms.append(compounds) + self.synonyms.append(compound) def parse(self, reponse): for parser in self.__parsers: diff --git a/Fourmi.py b/fourmi.py similarity index 93% rename from Fourmi.py rename to fourmi.py index f0218f6..76ab2e6 100755 --- a/Fourmi.py +++ b/fourmi.py @@ -3,8 +3,8 @@ Fourmi, an webscraper build to search specific information for a given compound. Usage: - fourmi search ... - fourmi [options] search ... + fourmi search + fourmi [options] search fourmi -h | --help fourmi --version @@ -44,8 +44,8 @@ def load_parsers(rel_dir="FourmiCrawler/parsers"): return parsers -def setup_crawler(searchables): - spider = FourmiSpider(compounds=searchables) +def setup_crawler(searchable): + spider = FourmiSpider(compound=searchable) spider.add_parsers(load_parsers()) settings = get_project_settings() crawler = Crawler(settings) From e65d3a6898f0350f2058d7178eda0c4f2074fc95 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 15 Apr 2014 18:57:51 +0200 Subject: [PATCH 5/8] Added the options for the Feed exports --- fourmi.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fourmi.py b/fourmi.py index 76ab2e6..005a4f1 100755 --- a/fourmi.py +++ b/fourmi.py @@ -9,10 +9,12 @@ Usage: fourmi --version Options: - -h --help Show this screen. - --version Show version. 
-    --verbose      Verbose logging output.
-    --log=         Save log to an file.
+    -h --help      Show this screen.
+    --version      Show version.
+    --verbose      Verbose logging output.
+    --log=         Save log to an file.
+    -o --output=   Output file [default: result.*format*]
+    -f --format=   Output formats [default: jsonlines | supported: csv, json, jsonlines, xml]
 """

 import os

From 61ca2520e35ca63e5fb79f2e2205af5bed883701 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Tue, 15 Apr 2014 19:40:54 +0200
Subject: [PATCH 6/8] Added feed export functionality

---
 FourmiCrawler/settings.py |  3 +++
 fourmi.py                 | 27 ++++++++++++++++++++++-----
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index b025167..be91fef 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -13,6 +13,9 @@ NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
     'FourmiCrawler.pipelines.FourmiPipeline': 100
 }
+FEED_URI = 'results.json'
+FEED_FORMAT = 'jsonlines'
+

 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent

diff --git a/fourmi.py b/fourmi.py
index 005a4f1..5999f8f 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -14,7 +14,7 @@ Options:
     --verbose      Verbose logging output.
     --log=         Save log to an file.
     -o --output=   Output file [default: result.*format*]
-    -f --format=   Output formats [default: jsonlines | supported: csv, json, jsonlines, xml]
+    -f --format=   Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
 """

 import os
@@ -41,15 +41,14 @@ def load_parsers(rel_dir="FourmiCrawler/parsers"):
         classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
         for cls in classes:
             if issubclass(cls, Parser) and cls not in known_parser:
-                parsers.append(cls()) # [review] - Would we ever need arguments for the parsers?
+                parsers.append(cls())  # [review] - Would we ever need arguments for the parsers?
                 known_parser.add(cls)
     return parsers


-def setup_crawler(searchable):
+def setup_crawler(searchable, settings):
     spider = FourmiSpider(compound=searchable)
     spider.add_parsers(load_parsers())
-    settings = get_project_settings()
     crawler = Crawler(settings)
     crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     crawler.configure()
@@ -57,6 +56,22 @@ def setup_crawler(searchable, settings):
     crawler.start()


+def scrapy_settings_manipulation(arguments):
+    settings = get_project_settings()
+
+    if arguments["--output"] != 'result.*format*':
+        settings.overrides["FEED_URI"] = arguments["--output"]
+    elif arguments["--format"] == "jsonlines":
+        settings.overrides["FEED_URI"] = "results.json"
+    elif arguments["--format"] is not None:
+        settings.overrides["FEED_URI"] = "results." + arguments["--format"]
+
+    if arguments["--format"] is not None:
+        settings.overrides["FEED_FORMAT"] = arguments["--format"]
+
+    return settings
+
+
 def start_log(arguments):
     if arguments["--log"] is not None:
         if arguments["--verbose"]:
@@ -73,6 +88,8 @@ def start_log(arguments):

 if __name__ == '__main__':
     arguments = docopt.docopt(__doc__, version='Fourmi - V0.0.1a')
     start_log(arguments)
-    setup_crawler([arguments[""]])
+    print arguments
+    settings = scrapy_settings_manipulation(arguments)
+    setup_crawler([arguments[""]], settings)
     reactor.run()

From d770f79a7a12a4f1a5f0dbfa4b3a6b32fe1a0206 Mon Sep 17 00:00:00 2001
Dekker" Date: Tue, 15 Apr 2014 19:46:10 +0200 Subject: [PATCH 7/8] Bumped version number --- fourmi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fourmi.py b/fourmi.py index 5999f8f..b6080bc 100755 --- a/fourmi.py +++ b/fourmi.py @@ -86,7 +86,7 @@ def start_log(arguments): if __name__ == '__main__': - arguments = docopt.docopt(__doc__, version='Fourmi - V0.0.1a') + arguments = docopt.docopt(__doc__, version='Fourmi - V0.1.0') start_log(arguments) print arguments settings = scrapy_settings_manipulation(arguments) From 972e5da0d2343199c26f1ca8375291043d1af647 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 15 Apr 2014 19:48:27 +0200 Subject: [PATCH 8/8] Removed debug code and typos. --- fourmi.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fourmi.py b/fourmi.py index b6080bc..d8bc427 100755 --- a/fourmi.py +++ b/fourmi.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ -Fourmi, an webscraper build to search specific information for a given compound. +Fourmi, an web scraper build to search specific information for a given compound (and it's pseudonyms). Usage: fourmi search @@ -88,7 +88,6 @@ def start_log(arguments): if __name__ == '__main__': arguments = docopt.docopt(__doc__, version='Fourmi - V0.1.0') start_log(arguments) - print arguments settings = scrapy_settings_manipulation(arguments) setup_crawler([arguments[""]], settings) reactor.run()