Archived
1
0
This repository has been archived on 2025-03-03. You can view files and clone it, but cannot push or open issues or pull requests.
Fourmi/fourmi.py
2014-06-15 21:09:43 +02:00

85 lines
3.6 KiB
Python
Executable File

#!/usr/bin/env python
"""
Fourmi, a web scraper built to search for specific information on a given compound (and its pseudonyms).
Usage:
fourmi search <compound>
fourmi [options] search <compound>
fourmi [-v | -vv | -vvv] [options] search <compound>
fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
fourmi list
fourmi [--include=<sourcename> | --exclude=<sourcename>] list
fourmi -h | --help
fourmi --version
Options:
--attributes=<regex> Include only attributes that match these regular expressions, split by a comma. [default: .*]
-h --help Show this screen.
--version Show version.
-v Verbose logging output. (Multiple occurrences increase logging level)
--log=<file> Save log to a file.
-o <file> --output=<file> Output file [default: results.*format*]
-f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
--include=<regex> Include only sources that match these regular expressions split by a comma.
--exclude=<regex> Exclude the sources that match these regular expressions split by a comma.
"""
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import signals, log
import docopt
from FourmiCrawler.spider import FourmiSpider
from utils.configurator import Configurator
from utils.sourceloader import SourceLoader
def setup_crawler(compound, settings, source_loader, attributes):
    """
    Prepare and launch the crawler that performs the actual search on the internet.
    :param compound: The compound which should be searched
    :param settings: A scrapy settings object
    :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
    :param attributes: A list of regular expressions which the attribute names should match.
    """
    fourmi_spider = FourmiSpider(compound=compound, selected_attributes=attributes)
    fourmi_spider.add_sources(source_loader.sources)

    search_crawler = Crawler(settings)
    # Stop the Twisted reactor when the spider closes, so the process can exit.
    search_crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    search_crawler.configure()
    search_crawler.crawl(fourmi_spider)
    search_crawler.start()
def search(docopt_arguments, source_loader):
    """
    Facilitate the search for a specific compound.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
    """
    conf = Configurator()
    conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])

    attribute_patterns = docopt_arguments["--attributes"].split(',')
    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
                  source_loader, attribute_patterns)

    # Start Scrapy logging with the settings resolved by the Configurator,
    # then hand control to the Twisted reactor (blocks until the crawl ends).
    scrapy_settings = conf.scrapy_settings
    log.start(scrapy_settings.get("LOG_FILE"),
              scrapy_settings.get("LOG_LEVEL"),
              scrapy_settings.get("LOG_STDOUT"))
    reactor.run()
# The start for the Fourmi Command Line interface.
if __name__ == '__main__':
arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.1')
loader = SourceLoader()
if arguments["--include"]:
loader.include(arguments["--include"].split(','))
elif arguments["--exclude"]:
loader.exclude(arguments["--exclude"].split(','))
if arguments["search"]:
search(arguments, loader)
elif arguments["list"]:
print "-== Available Sources ==-"
print str(loader)