From 13305f400b8cf6586541431bc6030e2de0fc7f79 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sat, 7 Jun 2014 11:57:59 +0200 Subject: [PATCH 01/15] Added coverage to the README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2b286a0..ef612f6 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Fourmi -**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi) +**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/Recondor/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=master) -**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi) +**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/Recondor/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=develop) Fourmi is an web scraper for chemical substances. The program is designed to be used as a search engine to search multiple chemical databases for a specific From 071018cbac4ca513ef5004e48acf4247d68efa20 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 11:42:01 +0200 Subject: [PATCH 02/15] Made a different python package for all helpers/utils --- fourmi.py | 2 +- tests/test_sourceloader.py | 2 +- utils/__init__.py | 0 sourceloader.py => utils/sourceloader.py | 0 4 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 utils/__init__.py rename sourceloader.py => utils/sourceloader.py (100%) diff --git a/fourmi.py b/fourmi.py index 3596cf3..e11009e 100755 --- a/fourmi.py +++ b/fourmi.py @@ -30,7 +30,7 @@ from scrapy.utils.project import get_project_settings import docopt from FourmiCrawler.spider import FourmiSpider -from sourceloader import SourceLoader +from utils.sourceloader import SourceLoader def setup_crawler(compound, settings, source_loader, attributes): diff --git a/tests/test_sourceloader.py b/tests/test_sourceloader.py index 1afca2d..9e62057 100644 --- a/tests/test_sourceloader.py +++ b/tests/test_sourceloader.py @@ -1,6 +1,6 @@ import unittest -from sourceloader import SourceLoader +from utils.sourceloader import SourceLoader class TestSourceloader(unittest.TestCase): diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sourceloader.py b/utils/sourceloader.py similarity index 100% rename from sourceloader.py rename to utils/sourceloader.py From edc91c227941f2de3047a06c51f9927e907ff7ff Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 12:04:33 +0200 Subject: [PATCH 03/15] Sourceloader should import dynamically --- utils/sourceloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/sourceloader.py b/utils/sourceloader.py index 2ed50a8..b6bb0fd 100644 --- a/utils/sourceloader.py +++ b/utils/sourceloader.py @@ -8,7 +8,7 @@ from FourmiCrawler.sources.source import Source class SourceLoader: sources = [] - def __init__(self, rel_dir="FourmiCrawler/sources"): + def __init__(self, rel_dir="../FourmiCrawler/sources"): """ The initiation of a SourceLoader, selects and indexes a directory for usable sources. :param rel_dir: A relative path to a directory. @@ -18,7 +18,7 @@ class SourceLoader: known_parser = set() for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: - mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py]) + mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py]) classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] for cls in classes: if issubclass(cls, Source) and cls not in known_parser: From 7cafdac7a038fc9a1c5d86d17aeb6026ec220bac Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 12:08:02 +0200 Subject: [PATCH 04/15] Test all python files --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 34d3a88..f208964 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,7 +10,7 @@ install: # command to run tests, e.g. python setup.py test script: - - nosetests --with-coverage --cover-package=FourmiCrawler tests + - nosetests --with-coverage tests notifications: slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM From ee80c6eaa0766115d76322a90c3f92b9635897ab Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 12:13:04 +0200 Subject: [PATCH 05/15] We don't want to check coverage for code we didn't write --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f208964..24c5dc5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,7 +10,7 @@ install: # command to run tests, e.g. python setup.py test script: - - nosetests --with-coverage tests + - nosetests --with-coverage --cover-package=FourmiCrawler,utils tests notifications: slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM From 007549aad815f0a5db9b8ce67c49b54cf1134419 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 12:40:07 +0200 Subject: [PATCH 06/15] Github Username Change --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ef612f6..8cb8d10 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Fourmi -**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/Recondor/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=master) +**Master branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=master) -**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/Recondor/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=develop) +**Developing branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=develop) Fourmi is an web scraper for chemical substances. The program is designed to be used as a search engine to search multiple chemical databases for a specific From 90129f41ccf1db9363736ec3373a30d5a2c56d4f Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 12:42:21 +0200 Subject: [PATCH 07/15] Added the configuration of the scrapy settings as a new module --- fourmi.py | 46 +++++-------------------------------------- utils/configurator.py | 43 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 41 deletions(-) create mode 100644 utils/configurator.py diff --git a/fourmi.py b/fourmi.py index e11009e..30b5a03 100755 --- a/fourmi.py +++ b/fourmi.py @@ -30,6 +30,7 @@ from scrapy.utils.project import get_project_settings import docopt from FourmiCrawler.spider import FourmiSpider +from utils.configurator import Configurator from utils.sourceloader import SourceLoader @@ -50,53 +51,16 @@ def setup_crawler(compound, settings, source_loader, attributes): crawler.start() -def scrapy_settings_manipulation(docopt_arguments): - """ - This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi - project these are command line arguments. - :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. - """ - settings = get_project_settings() - - if docopt_arguments["--output"] != 'result.*format*': - settings.overrides["FEED_URI"] = docopt_arguments["--output"] - elif docopt_arguments["--format"] == "jsonlines": - settings.overrides["FEED_URI"] = "results.json" - elif docopt_arguments["--format"] is not None: - settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"] - - if docopt_arguments["--format"] is not None: - settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"] - - return settings - - -def start_log(docopt_arguments): - """ - This function starts the logging functionality of Scrapy using the settings given by the CLI. - :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. - """ - if docopt_arguments["--log"] is not None: - if docopt_arguments["--verbose"]: - log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG) - else: - log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING) - else: - if docopt_arguments["--verbose"]: - log.start(logstdout=False, loglevel=log.DEBUG) - else: - log.start(logstdout=True, loglevel=log.WARNING) - - def search(docopt_arguments, source_loader): """ The function that facilitates the search for a specific compound. :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. :param source_loader: An initiated SourceLoader object pointed at the directory with the sources. """ - start_log(docopt_arguments) - settings = scrapy_settings_manipulation(docopt_arguments) - setup_crawler(docopt_arguments[""], settings, source_loader, docopt_arguments["--attributes"].split(',')) + conf = Configurator() + conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"]) + conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) + setup_crawler(docopt_arguments[""], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) reactor.run() diff --git a/utils/configurator.py b/utils/configurator.py new file mode 100644 index 0000000..8b7ae8a --- /dev/null +++ b/utils/configurator.py @@ -0,0 +1,43 @@ +from scrapy import log +from scrapy.utils.project import get_project_settings + + +class Configurator: + + def __init__(self): + self.scrapy_settings = get_project_settings() + + + def set_output(self, filename, format): + """ + This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi + project these are command line arguments. + :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. + """ + + if filename != 'result.*format*': + self.scrapy_settings.overrides["FEED_URI"] = format + elif format == "jsonlines": + self.scrapy_settings.overrides["FEED_URI"] = "results.json" + elif format is not None: + self.scrapy_settings.overrides["FEED_URI"] = "results." + format + + if format is not None: + self.scrapy_settings.overrides["FEED_FORMAT"] = format + + + def start_log(self, logfile, verbose): + """ + This function starts the logging functionality of Scrapy using the settings given by the CLI. + :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. + """ + if logfile is not None: + if verbose: + log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG) + else: + log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING) + else: + if verbose: + log.start(logstdout=False, loglevel=log.DEBUG) + else: + log.start(logstdout=True, loglevel=log.WARNING) From d765e7fce43c65ffcb6daac4c2c9005ff47c9ea5 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 12:46:50 +0200 Subject: [PATCH 08/15] Edited the documentation of the functions in the configurator --- utils/configurator.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/utils/configurator.py b/utils/configurator.py index 8b7ae8a..8e2e7e8 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -8,28 +8,30 @@ class Configurator: self.scrapy_settings = get_project_settings() - def set_output(self, filename, format): + def set_output(self, filename, fileformat): """ - This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi - project these are command line arguments. - :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. + This function manipulates the Scrapy output file settings that normally would be set in the settings file. + In the Fourmi project these are command line arguments. + :param filename: The filename of the file where the output will be put. + :param fileformat: The format in which the output will be. """ if filename != 'result.*format*': - self.scrapy_settings.overrides["FEED_URI"] = format - elif format == "jsonlines": + self.scrapy_settings.overrides["FEED_URI"] = fileformat + elif fileformat == "jsonlines": self.scrapy_settings.overrides["FEED_URI"] = "results.json" - elif format is not None: - self.scrapy_settings.overrides["FEED_URI"] = "results." + format + elif fileformat is not None: + self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat - if format is not None: - self.scrapy_settings.overrides["FEED_FORMAT"] = format + if fileformat is not None: + self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat def start_log(self, logfile, verbose): """ This function starts the logging functionality of Scrapy using the settings given by the CLI. - :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. + :param logfile: The location where the logfile will be saved. + :param verbose: A boolean value to switch between loglevels. """ if logfile is not None: if verbose: From 51239dd34262d4279fe0d221e5e26aac3619c66e Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 12:47:57 +0200 Subject: [PATCH 09/15] Added a few lines on the configurator itself. --- utils/configurator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/configurator.py b/utils/configurator.py index 8e2e7e8..380b647 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -3,6 +3,10 @@ from scrapy.utils.project import get_project_settings class Configurator: + """ + A helper class in the fourmi class. This class is used to process the settings as set + from one of the Fourmi applications. + """ def __init__(self): self.scrapy_settings = get_project_settings() From c4ef75cf57183158412d03c0e45c06b1d6e0d8a0 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 13:00:32 +0200 Subject: [PATCH 10/15] Uniform naming of result file --- fourmi.py | 2 +- utils/configurator.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fourmi.py b/fourmi.py index 30b5a03..a959091 100755 --- a/fourmi.py +++ b/fourmi.py @@ -17,7 +17,7 @@ Options: --version Show version. --verbose Verbose logging output. --log= Save log to an file. - -o --output= Output file [default: result.*format*] + -o --output= Output file [default: results.*format*] -f --format= Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines] --include= Include only sources that match these regular expressions split by a comma. --exclude= Exclude the sources that match these regular expressions split by a comma. diff --git a/utils/configurator.py b/utils/configurator.py index 380b647..2b458b6 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -20,7 +20,7 @@ class Configurator: :param fileformat: The format in which the output will be. """ - if filename != 'result.*format*': + if filename != 'results.*format*': self.scrapy_settings.overrides["FEED_URI"] = fileformat elif fileformat == "jsonlines": self.scrapy_settings.overrides["FEED_URI"] = "results.json" From 683de68fb7cb7943125437b21f6b1350325e93ea Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 13:08:37 +0200 Subject: [PATCH 11/15] Added tests and fixed the output settings --- utils/configurator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/configurator.py b/utils/configurator.py index 2b458b6..90e0320 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -21,7 +21,7 @@ class Configurator: """ if filename != 'results.*format*': - self.scrapy_settings.overrides["FEED_URI"] = fileformat + self.scrapy_settings.overrides["FEED_URI"] = filename elif fileformat == "jsonlines": self.scrapy_settings.overrides["FEED_URI"] = "results.json" elif fileformat is not None: From 351a7d08eae1385243cd3784ddc0cd5d8da41d18 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 13:10:18 +0200 Subject: [PATCH 12/15] Added tests for the configurator --- tests/test_configurator.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 tests/test_configurator.py diff --git a/tests/test_configurator.py b/tests/test_configurator.py new file mode 100644 index 0000000..da79096 --- /dev/null +++ b/tests/test_configurator.py @@ -0,0 +1,27 @@ +import unittest +from utils.configurator import Configurator + + +class TestConfigurator(unittest.TestCase): + + def setUp(self): + self.conf = Configurator() + + def test_set_output(self): + self.conf.set_output(filename="test.txt", fileformat="csv") + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt") + self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") + + self.conf.set_output("results.*format*", "jsonlines") + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json") + self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines") + + self.conf.set_output("results.*format*", "csv") + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv") + self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") + + def test_start_log(self): + self.conf.start_log("test.log", True) + self.conf.start_log("test.log", False) + self.conf.start_log(None, True) + self.conf.start_log(None, False) \ No newline at end of file From a1dd39f92a9fd6860e148360f8ad3e9ca567974a Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 13:12:35 +0200 Subject: [PATCH 13/15] Made CSV our default format, as it's probably the most likely to be used. --- fourmi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fourmi.py b/fourmi.py index a959091..68d221a 100755 --- a/fourmi.py +++ b/fourmi.py @@ -18,7 +18,7 @@ Options: --verbose Verbose logging output. --log= Save log to an file. -o --output= Output file [default: results.*format*] - -f --format= Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines] + -f --format= Output formats (supported: csv, json, jsonlines, xml) [default: csv] --include= Include only sources that match these regular expressions split by a comma. --exclude= Exclude the sources that match these regular expressions split by a comma. """ From efa8d45d9c08120374b8cb34a39574565881f3d8 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 13:28:14 +0200 Subject: [PATCH 14/15] I don't yet know a way to test the start_log function --- tests/test_configurator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_configurator.py b/tests/test_configurator.py index da79096..8cc61ea 100644 --- a/tests/test_configurator.py +++ b/tests/test_configurator.py @@ -20,8 +20,8 @@ class TestConfigurator(unittest.TestCase): self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv") self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") - def test_start_log(self): - self.conf.start_log("test.log", True) - self.conf.start_log("test.log", False) - self.conf.start_log(None, True) - self.conf.start_log(None, False) \ No newline at end of file + # def test_start_log(self): + # self.conf.start_log("test.log", True) + # self.conf.start_log("test.log", False) + # self.conf.start_log(None, True) + # self.conf.start_log(None, False) \ No newline at end of file From 98c3fbc590920e1afbeaf036841791e197c7d5e5 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 13:33:57 +0200 Subject: [PATCH 15/15] Link of to the right page (Github name change) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8cb8d10..48b0419 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Fourmi -**Master branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=master) +**Master branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=master)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=master) -**Developing branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=develop) +**Developing branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=develop)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=develop) Fourmi is an web scraper for chemical substances. The program is designed to be used as a search engine to search multiple chemical databases for a specific