Archived
1
0
This repository has been archived on 2025-03-03. You can view files and clone it, but cannot push or open issues or pull requests.
Fourmi/fourmi.py
2014-06-21 01:33:52 +02:00

90 lines
3.8 KiB
Python
Executable File

#!/usr/bin/env python
"""
Fourmi, a web scraper built to search for specific information on a given compound (and its pseudonyms).
Usage:
fourmi
fourmi search <compound>
fourmi [options] search <compound>
fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
fourmi list
fourmi [--include=<sourcename> | --exclude=<sourcename>] list
fourmi -h | --help
fourmi --version
Options:
--attributes=<regex> Include only attributes that match these regular expressions, split by a comma. [default: .*]
-h --help Show this screen.
--version Show version.
-v Verbose logging output. (Multiple occurrences increase logging level)
--log=<file> Save the log to a file.
-o <file> --output=<file> Output file [default: <compound>.*format*]
-f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
--include=<regex> Include only sources that match these regular expressions split by a comma.
--exclude=<regex> Exclude the sources that match these regular expressions split by a comma.
"""
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import signals, log
import docopt
from FourmiCrawler.spider import FourmiSpider
from utils.configurator import Configurator
from utils.sourceloader import SourceLoader
from GUI import gui
def setup_crawler(compound, settings, source_loader, attributes):
    """
    Prepare and launch the crawler that performs the actual search on the internet.
    :param compound: The compound which should be searched
    :param settings: A scrapy settings object
    :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
    :param attributes: A list of regular expressions which the attribute names should match.
    """
    fourmi_spider = FourmiSpider(compound=compound, selected_attributes=attributes)
    fourmi_spider.add_sources(source_loader.sources)
    search_crawler = Crawler(settings)
    # Stop the twisted reactor as soon as the spider closes, so the process can exit.
    search_crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    search_crawler.configure()
    search_crawler.crawl(fourmi_spider)
    search_crawler.start()
def search(docopt_arguments, source_loader):
    """
    Facilitate the search for a specific compound.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
    """
    configurator = Configurator()
    configurator.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
    configurator.set_output(docopt_arguments["--output"], docopt_arguments["--format"],
                            docopt_arguments["<compound>"])
    setup_crawler(docopt_arguments["<compound>"], configurator.scrapy_settings,
                  source_loader, docopt_arguments["--attributes"].split(','))
    # Only start scrapy's logging facility when the settings enable it.
    if configurator.scrapy_settings.getbool("LOG_ENABLED"):
        log.start(configurator.scrapy_settings.get("LOG_FILE"),
                  configurator.scrapy_settings.get("LOG_LEVEL"),
                  configurator.scrapy_settings.get("LOG_STDOUT"))
    reactor.run()  # Blocks until reactor.stop() fires (i.e. the spider has closed).
# The start for the Fourmi Command Line interface.
if __name__ == '__main__':
arguments = docopt.docopt(__doc__, version='Fourmi - V0.6.0')
loader = SourceLoader()
if arguments["--include"]:
loader.include(arguments["--include"].split(','))
elif arguments["--exclude"]:
loader.exclude(arguments["--exclude"].split(','))
if arguments["search"]:
search(arguments, loader)
elif arguments["list"]:
print "-== Available Sources ==-"
print str(loader)
else:
gui_window = gui.GUI(search, sourceloader=SourceLoader())
gui_window.run()