diff --git a/.travis.yml b/.travis.yml
index 34d3a88..24c5dc5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,7 +10,7 @@ install:
 
 # command to run tests, e.g. python setup.py test
 script:
-  - nosetests --with-coverage --cover-package=FourmiCrawler tests
+  - nosetests --with-coverage --cover-package=FourmiCrawler,utils tests
 
 notifications:
   slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
diff --git a/README.md b/README.md
index 2b286a0..48b0419 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 # Fourmi
 
-**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi)
+**Master branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=master)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=master)
 
-**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi)
+**Developing branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=develop)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)
 
 Fourmi is a web scraper for chemical substances. The program is designed to be
 used as a search engine to search multiple chemical databases for a specific
diff --git a/fourmi.py b/fourmi.py
index 3596cf3..68d221a 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -17,8 +17,8 @@ Options:
   --version                Show version.
   --verbose                Verbose logging output.
   --log=<file>             Save log to a file.
-  -o --output=<file>       Output file [default: result.*format*]
-  -f --format=<format>     Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
+  -o --output=<file>       Output file [default: results.*format*]
+  -f --format=<format>     Output formats (supported: csv, json, jsonlines, xml) [default: csv]
   --include=<regex>        Include only sources that match these regular expressions split by a comma.
   --exclude=<regex>        Exclude the sources that match these regular expressions split by a comma.
 """
@@ -30,7 +30,8 @@ from scrapy.utils.project import get_project_settings
 import docopt
 
 from FourmiCrawler.spider import FourmiSpider
-from sourceloader import SourceLoader
+from utils.configurator import Configurator
+from utils.sourceloader import SourceLoader
 
 
 def setup_crawler(compound, settings, source_loader, attributes):
@@ -50,53 +51,16 @@ def setup_crawler(compound, settings, source_loader, attributes):
     crawler.start()
 
 
-def scrapy_settings_manipulation(docopt_arguments):
-    """
-    This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
-    project these are command line arguments.
-    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
-    """
-    settings = get_project_settings()
-
-    if docopt_arguments["--output"] != 'result.*format*':
-        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
-    elif docopt_arguments["--format"] == "jsonlines":
-        settings.overrides["FEED_URI"] = "results.json"
-    elif docopt_arguments["--format"] is not None:
-        settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]
-
-    if docopt_arguments["--format"] is not None:
-        settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
-
-    return settings
-
-
-def start_log(docopt_arguments):
-    """
-    This function starts the logging functionality of Scrapy using the settings given by the CLI.
-    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
-    """
-    if docopt_arguments["--log"] is not None:
-        if docopt_arguments["--verbose"]:
-            log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
-        else:
-            log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
-    else:
-        if docopt_arguments["--verbose"]:
-            log.start(logstdout=False, loglevel=log.DEBUG)
-        else:
-            log.start(logstdout=True, loglevel=log.WARNING)
-
-
 def search(docopt_arguments, source_loader):
     """
     The function that facilitates the search for a specific compound.
     :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
     :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
     """
-    start_log(docopt_arguments)
-    settings = scrapy_settings_manipulation(docopt_arguments)
-    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
+    conf = Configurator()
+    conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
+    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
+    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
     reactor.run()
 
 
diff --git a/tests/test_configurator.py b/tests/test_configurator.py
new file mode 100644
index 0000000..8cc61ea
--- /dev/null
+++ b/tests/test_configurator.py
@@ -0,0 +1,27 @@
+import unittest
+from utils.configurator import Configurator
+
+
+class TestConfigurator(unittest.TestCase):
+
+    def setUp(self):
+        self.conf = Configurator()
+
+    def test_set_output(self):
+        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
+
+        self.conf.set_output("results.*format*", "jsonlines")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")
+
+        self.conf.set_output("results.*format*", "csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
+
+    # def test_start_log(self):
+    #     self.conf.start_log("test.log", True)
+    #     self.conf.start_log("test.log", False)
+    #     self.conf.start_log(None, True)
+    #     self.conf.start_log(None, False)
\ No newline at end of file
diff --git a/tests/test_sourceloader.py b/tests/test_sourceloader.py
index 1afca2d..9e62057 100644
--- a/tests/test_sourceloader.py
+++ b/tests/test_sourceloader.py
@@ -1,6 +1,6 @@
 import unittest
 
-from sourceloader import SourceLoader
+from utils.sourceloader import SourceLoader
 
 
 class TestSourceloader(unittest.TestCase):
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/configurator.py b/utils/configurator.py
new file mode 100644
index 0000000..90e0320
--- /dev/null
+++ b/utils/configurator.py
@@ -0,0 +1,49 @@
+from scrapy import log
+from scrapy.utils.project import get_project_settings
+
+
+class Configurator:
+    """
+    A helper class for Fourmi. This class is used to process the settings as set
+    by one of the Fourmi applications.
+ """ + + def __init__(self): + self.scrapy_settings = get_project_settings() + + + def set_output(self, filename, fileformat): + """ + This function manipulates the Scrapy output file settings that normally would be set in the settings file. + In the Fourmi project these are command line arguments. + :param filename: The filename of the file where the output will be put. + :param fileformat: The format in which the output will be. + """ + + if filename != 'results.*format*': + self.scrapy_settings.overrides["FEED_URI"] = filename + elif fileformat == "jsonlines": + self.scrapy_settings.overrides["FEED_URI"] = "results.json" + elif fileformat is not None: + self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat + + if fileformat is not None: + self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat + + + def start_log(self, logfile, verbose): + """ + This function starts the logging functionality of Scrapy using the settings given by the CLI. + :param logfile: The location where the logfile will be saved. + :param verbose: A boolean value to switch between loglevels. + """ + if logfile is not None: + if verbose: + log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG) + else: + log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING) + else: + if verbose: + log.start(logstdout=False, loglevel=log.DEBUG) + else: + log.start(logstdout=True, loglevel=log.WARNING) diff --git a/sourceloader.py b/utils/sourceloader.py similarity index 93% rename from sourceloader.py rename to utils/sourceloader.py index 8a6f8b4..07f966f 100644 --- a/sourceloader.py +++ b/utils/sourceloader.py @@ -9,7 +9,7 @@ from FourmiCrawler.sources.source import Source class SourceLoader: sources = [] - def __init__(self, rel_dir="FourmiCrawler/sources"): + def __init__(self, rel_dir="../FourmiCrawler/sources"): """ The initiation of a SourceLoader, selects and indexes a directory for usable sources. Also loads a configuration file for Sources and passes the arguments in @@ -24,7 +24,7 @@ class SourceLoader: config.read('sources.cfg') for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: - mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py]) + mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py]) classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] for cls in classes: if issubclass(cls, Source) and cls not in known_parser: