diff --git a/Fourmi.py b/Fourmi.py
deleted file mode 100755
index 7c3cf7d..0000000
--- a/Fourmi.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env python
-"""
-Fourmi - An internet webcrawler searching for information on chemical
-compounds. [todo] - Add some more useful text here.
-Version: v0.0.1 - Empty Application that could do something but all logic of websites isn't there yet!
-"""
-
-from twisted.internet import reactor
-from scrapy.crawler import Crawler
-from scrapy import log, signals
-from FourmiCrawler.parsers.parser import Parser
-from FourmiCrawler.spider import FourmiSpider
-from scrapy.utils.project import get_project_settings
-import os, inspect, re
-
-def load_parsers(rel_dir="FourmiCrawler/parsers"):
-    path = os.path.dirname(os.path.abspath(__file__))
-    path += "/" + rel_dir
-    parsers = []
-    known_parser = set()
-
-    for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
-        mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
-        classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
-        for cls in classes:
-            if issubclass(cls, Parser) and cls not in known_parser:
-                parsers.append(cls())  # [review] - Would we ever need arguments for the parsers?
-                known_parser.add(cls)
-    return parsers
-
-def setup_crawler(searchables):
-    spider = FourmiSpider(compounds=searchables)
-    spider.add_parsers(load_parsers())
-    settings = get_project_settings()
-    crawler = Crawler(settings)
-    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
-    crawler.configure()
-    crawler.crawl(spider)
-    crawler.start()
-
-
-def start():
-    setup_crawler(["Methane"])
-    log.start()
-    reactor.run()
-
-
-start()
diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index b025167..be91fef 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -13,6 +13,9 @@ NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
     'FourmiCrawler.pipelines.FourmiPipeline': 100
 }
+FEED_URI = 'results.json'
+FEED_FORMAT = 'jsonlines'
+
 
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 327de42..77b2c11 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -8,12 +8,9 @@ class FourmiSpider(Spider):
     __parsers = []
     synonyms = []
 
-    def __init__(self, compounds=None, *args, **kwargs):
+    def __init__(self, compound=None, *args, **kwargs):
         super(FourmiSpider, self).__init__(*args, **kwargs)
-        if isinstance(compounds, list):
-            self.synonyms.extend(compounds)
-        else:
-            self.synonyms.append(compounds)
+        self.synonyms.append(compound)
 
     def parse(self, reponse):
         for parser in self.__parsers:
diff --git a/fourmi.py b/fourmi.py
new file mode 100755
index 0000000..5999f8f
--- /dev/null
+++ b/fourmi.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+"""
+Fourmi, a web scraper built to search for specific information on a given compound.
+
+Usage:
+    fourmi search <compound>
+    fourmi [options] search <compound>
+    fourmi -h | --help
+    fourmi --version
+
+Options:
+    -h --help                       Show this screen.
+    --version                       Show version.
+    --verbose                       Verbose logging output.
+    --log=<file>                    Save log to a file.
+    -o --output=<file>              Output file [default: result.*format*]
+    -f --format=<format>            Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
+"""
+
+import os
+import inspect
+
+from twisted.internet import reactor
+from scrapy.crawler import Crawler
+from scrapy import log, signals
+from scrapy.utils.project import get_project_settings
+import docopt
+
+from FourmiCrawler.parsers.parser import Parser
+from FourmiCrawler.spider import FourmiSpider
+
+
+def load_parsers(rel_dir="FourmiCrawler/parsers"):
+    path = os.path.dirname(os.path.abspath(__file__))
+    path += "/" + rel_dir
+    parsers = []
+    known_parser = set()
+
+    for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
+        mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
+        classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
+        for cls in classes:
+            if issubclass(cls, Parser) and cls not in known_parser:
+                parsers.append(cls())  # [review] - Would we ever need arguments for the parsers?
+                known_parser.add(cls)
+    return parsers
+
+
+def setup_crawler(searchable, settings):
+    spider = FourmiSpider(compound=searchable)
+    spider.add_parsers(load_parsers())
+    crawler = Crawler(settings)
+    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
+    crawler.configure()
+    crawler.crawl(spider)
+    crawler.start()
+
+
+def scrapy_settings_manipulation(arguments):
+    settings = get_project_settings()
+
+    if arguments["--output"] != 'result.*format*':
+        settings.overrides["FEED_URI"] = arguments["--output"]
+    elif arguments["--format"] == "jsonlines":
+        settings.overrides["FEED_URI"] = "results.json"
+    elif arguments["--format"] is not None:
+        settings.overrides["FEED_URI"] = "results." + arguments["--format"]
+
+    if arguments["--format"] is not None:
+        settings.overrides["FEED_FORMAT"] = arguments["--format"]
+
+    return settings
+
+
+def start_log(arguments):
+    if arguments["--log"] is not None:
+        if arguments["--verbose"]:
+            log.start(logfile=arguments["--log"], logstdout=False, loglevel=log.DEBUG)
+        else:
+            log.start(logfile=arguments["--log"], logstdout=True, loglevel=log.WARNING)
+    else:
+        if arguments["--verbose"]:
+            log.start(logstdout=False, loglevel=log.DEBUG)
+        else:
+            log.start(logstdout=True, loglevel=log.WARNING)
+
+
+if __name__ == '__main__':
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.0.1a')
+    start_log(arguments)
+    print arguments
+    settings = scrapy_settings_manipulation(arguments)
+    setup_crawler(arguments["<compound>"], settings)
+    reactor.run()
+
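Note on the new CLI: below is a minimal sketch of how the docopt arguments map onto the Scrapy feed settings, assuming the docopt package is installed and `__doc__` is the fourmi.py docstring above; the `argv` list and the compound "Methane" are illustrative only, not part of the patch.

    import docopt

    # Simulate a hypothetical invocation: fourmi --format=csv search Methane
    args = docopt.docopt(__doc__, argv=['--format=csv', 'search', 'Methane'])

    assert args['<compound>'] == 'Methane'
    assert args['--format'] == 'csv'
    assert args['--output'] == 'result.*format*'  # docopt default, i.e. no -o given

    # scrapy_settings_manipulation(args) would then override:
    #   FEED_URI    -> 'results.csv'  (output left at its default, format is not jsonlines)
    #   FEED_FORMAT -> 'csv'

Since FourmiSpider.__init__ now appends a single compound to self.synonyms, setup_crawler is called with the bare string arguments["<compound>"] rather than a one-element list; wrapping it in a list would push the list itself into synonyms.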