Merge branch 'feature/restructure' into develop
commit efb7d60079
.travis.yml
@@ -10,7 +10,7 @@ install:
 
 # command to run tests, e.g. python setup.py test
 script:
-  - nosetests --with-coverage --cover-package=FourmiCrawler tests
+  - nosetests --with-coverage --cover-package=FourmiCrawler,utils tests
 
 notifications:
   slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
README.md
@@ -1,8 +1,8 @@
 # Fourmi
 
-**Master branch**: [](https://travis-ci.org/Recondor/Fourmi) [](https://coveralls.io/r/Recondor/Fourmi?branch=master)
+**Master branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=master)
 
-**Developing branch**: [](https://travis-ci.org/Recondor/Fourmi) [](https://coveralls.io/r/Recondor/Fourmi?branch=develop)
+**Developing branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)
 
 Fourmi is an web scraper for chemical substances. The program is designed to be
 used as a search engine to search multiple chemical databases for a specific
fourmi.py (52 lines changed)
@@ -17,8 +17,8 @@ Options:
   --version                Show version.
   --verbose                Verbose logging output.
   --log=<file>             Save log to an file.
-  -o <file> --output=<file>  Output file [default: result.*format*]
-  -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
+  -o <file> --output=<file>  Output file [default: results.*format*]
+  -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: csv]
   --include=<regex>        Include only sources that match these regular expressions split by a comma.
   --exclude=<regex>        Exclude the sources that match these regular expressions split by a comma.
 """
@@ -30,7 +30,8 @@ from scrapy.utils.project import get_project_settings
 import docopt
 
 from FourmiCrawler.spider import FourmiSpider
-from sourceloader import SourceLoader
+from utils.configurator import Configurator
+from utils.sourceloader import SourceLoader
 
 
 def setup_crawler(compound, settings, source_loader, attributes):
@@ -50,53 +51,16 @@ def setup_crawler(compound, settings, source_loader, attributes):
     crawler.start()
 
 
-def scrapy_settings_manipulation(docopt_arguments):
-    """
-    This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
-    project these are command line arguments.
-    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
-    """
-    settings = get_project_settings()
-
-    if docopt_arguments["--output"] != 'result.*format*':
-        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
-    elif docopt_arguments["--format"] == "jsonlines":
-        settings.overrides["FEED_URI"] = "results.json"
-    elif docopt_arguments["--format"] is not None:
-        settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]
-
-    if docopt_arguments["--format"] is not None:
-        settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
-
-    return settings
-
-
-def start_log(docopt_arguments):
-    """
-    This function starts the logging functionality of Scrapy using the settings given by the CLI.
-    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
-    """
-    if docopt_arguments["--log"] is not None:
-        if docopt_arguments["--verbose"]:
-            log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
-        else:
-            log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
-    else:
-        if docopt_arguments["--verbose"]:
-            log.start(logstdout=False, loglevel=log.DEBUG)
-        else:
-            log.start(logstdout=True, loglevel=log.WARNING)
-
-
 def search(docopt_arguments, source_loader):
     """
     The function that facilitates the search for a specific compound.
     :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
     :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
     """
-    start_log(docopt_arguments)
-    settings = scrapy_settings_manipulation(docopt_arguments)
-    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
+    conf = Configurator()
+    conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
+    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
+    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
     reactor.run()
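Taken together, the fourmi.py changes replace the free functions scrapy_settings_manipulation() and start_log() with the new utils.configurator.Configurator class, so search() touches a single configuration object. A minimal sketch of the resulting flow, not part of the commit, assuming it runs inside the Fourmi project (the "results.*format*" placeholder is from the diff; the printed check is illustrative):

    from utils.configurator import Configurator

    conf = Configurator()                        # wraps Scrapy's get_project_settings()
    conf.start_log(logfile=None, verbose=True)   # no logfile given: DEBUG-level Scrapy logging
    conf.set_output("results.*format*", "csv")   # FEED_URI="results.csv", FEED_FORMAT="csv"
    print(conf.scrapy_settings["FEED_URI"], conf.scrapy_settings["FEED_FORMAT"])

As the diff shows, search() then hands conf.scrapy_settings to setup_crawler() before reactor.run(); keeping both settings mutations behind one object is what makes the new unit tests below possible without going through the CLI.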
tests/test_configurator.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+import unittest
+from utils.configurator import Configurator
+
+
+class TestConfigurator(unittest.TestCase):
+
+    def setUp(self):
+        self.conf = Configurator()
+
+    def test_set_output(self):
+        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
+
+        self.conf.set_output("results.*format*", "jsonlines")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")
+
+        self.conf.set_output("results.*format*", "csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
+
+    # def test_start_log(self):
+    #     self.conf.start_log("test.log", True)
+    #     self.conf.start_log("test.log", False)
+    #     self.conf.start_log(None, True)
+    #     self.conf.start_log(None, False)
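These tests exercise exactly the FEED_URI/FEED_FORMAT branches of Configurator.set_output() shown in utils/configurator.py below; the test_start_log() block is presumably left commented out because log.start() starts Scrapy's global logging, which is awkward to call repeatedly inside a unit test. With the utils package now under test, the .travis.yml change above widens coverage to match (--cover-package=FourmiCrawler,utils).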
tests/test_sourceloader.py
@@ -1,6 +1,6 @@
 import unittest
 
-from sourceloader import SourceLoader
+from utils.sourceloader import SourceLoader
 
 
 class TestSourceloader(unittest.TestCase):
utils/__init__.py (new file, 0 lines)
utils/configurator.py (new file, 49 lines)
@@ -0,0 +1,49 @@
+from scrapy import log
+from scrapy.utils.project import get_project_settings
+
+
+class Configurator:
+    """
+    A helper class in the fourmi class. This class is used to process the settings as set
+    from one of the Fourmi applications.
+    """
+
+    def __init__(self):
+        self.scrapy_settings = get_project_settings()
+
+
+    def set_output(self, filename, fileformat):
+        """
+        This function manipulates the Scrapy output file settings that normally would be set in the settings file.
+        In the Fourmi project these are command line arguments.
+        :param filename: The filename of the file where the output will be put.
+        :param fileformat: The format in which the output will be.
+        """
+
+        if filename != 'results.*format*':
+            self.scrapy_settings.overrides["FEED_URI"] = filename
+        elif fileformat == "jsonlines":
+            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+        elif fileformat is not None:
+            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+
+        if fileformat is not None:
+            self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
+
+
+    def start_log(self, logfile, verbose):
+        """
+        This function starts the logging functionality of Scrapy using the settings given by the CLI.
+        :param logfile: The location where the logfile will be saved.
+        :param verbose: A boolean value to switch between loglevels.
+        """
+        if logfile is not None:
+            if verbose:
+                log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
+            else:
+                log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
+        else:
+            if verbose:
+                log.start(logstdout=False, loglevel=log.DEBUG)
+            else:
+                log.start(logstdout=True, loglevel=log.WARNING)
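To make set_output()'s branch order concrete, a hedged illustration (the file names are examples, not from the commit): an explicit output filename always wins, while the "results.*format*" placeholder falls back to results.json for jsonlines and to results.<format> for any other format.

    from utils.configurator import Configurator

    conf = Configurator()  # assumes it runs inside the Fourmi project
    conf.set_output("out.json", "json")               # explicit name wins: FEED_URI="out.json"
    conf.set_output("results.*format*", "jsonlines")  # placeholder + jsonlines: FEED_URI="results.json"
    conf.set_output("results.*format*", "xml")        # placeholder + other format: FEED_URI="results.xml"

In every branch FEED_FORMAT is also set whenever a format is given, which is what tests/test_configurator.py asserts above.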
utils/sourceloader.py
@@ -8,7 +8,7 @@ from FourmiCrawler.sources.source import Source
 class SourceLoader:
     sources = []
 
-    def __init__(self, rel_dir="FourmiCrawler/sources"):
+    def __init__(self, rel_dir="../FourmiCrawler/sources"):
         """
         The initiation of a SourceLoader, selects and indexes a directory for usable sources.
         :param rel_dir: A relative path to a directory.
@@ -18,7 +18,7 @@ class SourceLoader:
         known_parser = set()
 
         for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
-            mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
+            mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
             classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
             for cls in classes:
                 if issubclass(cls, Source) and cls not in known_parser:
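The pair of sourceloader changes keeps the two roles of rel_dir consistent after the move into utils/: the filesystem default now points one directory up ("../FourmiCrawler/sources"), while the added replace("../", "") strips that prefix so the dotted module path passed to __import__ stays "FourmiCrawler.sources". A self-contained sketch of the string transformation (the module name "ExampleSource" is hypothetical):

    # Illustration only: how the new default rel_dir becomes a dotted import path.
    rel_dir = "../FourmiCrawler/sources"
    py = "ExampleSource"  # hypothetical source module under FourmiCrawler/sources
    dotted = '.'.join([rel_dir.replace("../", "").replace("/", "."), py])
    assert dotted == "FourmiCrawler.sources.ExampleSource"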