#!/usr/bin/env python
"""
Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).

Usage:
    fourmi
    fourmi search <compound>
    fourmi [options] search <compound>
    fourmi [options] [-v | -vv | -vvv] [--include=<regex> | --exclude=<regex>] search <compound>
    fourmi list
    fourmi [--include=<regex> | --exclude=<regex>] list
    fourmi -h | --help
    fourmi --version

Options:
    --attributes=<regex>            Include only that match these regular expressions split by a comma. [default: .*]
    -h --help                       Show this screen.
    --version                       Show version.
    -v                              Verbose logging output. (Multiple occurrences increase logging level)
    --log=<file>                    Save log to an file.
    -o <file> --output=<file>       Output file [default: results.*format*]
    -f <format> --format=<format>   Output formats (supported: csv, json, jsonlines, xml) [default: csv]
    --include=<regex>               Include only sources that match these regular expressions split by a comma.
    --exclude=<regex>               Exclude the sources that match these regular expressions split by a comma.
"""
# NOTE: the module docstring above is parsed by docopt at runtime, so it is
# effectively code. The positional name used in the usage patterns
# ("<compound>") must stay in sync with the key looked up in search(), and
# every option description must be separated from its option by at least two
# spaces for docopt to pick up the argument and the [default: ...] value.

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import signals, log
import docopt

from FourmiCrawler.spider import FourmiSpider
from utils.configurator import Configurator
from utils.sourceloader import SourceLoader
from GUI import gui


def setup_crawler(compound, settings, source_loader, attributes):
    """
    Build a FourmiSpider for the compound and schedule it on a new Scrapy crawler.

    The crawler is wired to stop the Twisted reactor once the spider closes;
    the reactor itself is not started here (see search()).

    :param compound: The compound which should be searched
    :param settings: A scrapy settings object
    :param source_loader: A SourceLoader object whose ``sources`` attribute holds the sources that should be used.
    :param attributes: A list of regular expressions which the attribute names should match.
    """
    spider = FourmiSpider(compound=compound, selected_attributes=attributes)
    spider.add_sources(source_loader.sources)

    crawler = Crawler(settings)
    # Stop the reactor when the spider finishes so reactor.run() in search() returns.
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()


def search(docopt_arguments, source_loader):
    """
    Run a search for a specific compound and block until the crawl has finished.

    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. The keys read here are
                             "<compound>", "--log", "-v", "--output", "--format" and "--attributes".
    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
    """
    conf = Configurator()
    conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])

    # The positional argument is declared as <compound> in the usage text, so
    # that is the key docopt stores it under.
    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
                  source_loader, docopt_arguments["--attributes"].split(','))

    if conf.scrapy_settings.getbool("LOG_ENABLED"):
        log.start(conf.scrapy_settings.get("LOG_FILE"),
                  conf.scrapy_settings.get("LOG_LEVEL"),
                  conf.scrapy_settings.get("LOG_STDOUT"))

    # Blocks until reactor.stop is triggered by the spider_closed signal.
    reactor.run()


# The start for the Fourmi Command Line interface.
if __name__ == '__main__':
    arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.3')
    loader = SourceLoader()

    # --include and --exclude are mutually exclusive in the usage patterns.
    if arguments["--include"]:
        loader.include(arguments["--include"].split(','))
    elif arguments["--exclude"]:
        loader.exclude(arguments["--exclude"].split(','))

    if arguments["search"]:
        search(arguments, loader)
    elif arguments["list"]:
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print("-== Available Sources ==-")
        print(str(loader))
    else:
        # No sub-command given: fall back to the graphical interface, which is
        # handed the search callback and its own fresh SourceLoader.
        gui_window = gui.GUI(search, sourceloader=SourceLoader())
        gui_window.run()