Merge branch 'develop' into feature/sources-configuration

commit 4f49e5cf8b
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,7 +10,7 @@ install:
 # command to run tests, e.g. python setup.py test
 script:
-  - nosetests --with-coverage --cover-package=FourmiCrawler tests
+  - nosetests --with-coverage --cover-package=FourmiCrawler,utils tests
 
 notifications:
   slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 # Fourmi
 
-**Master branch**: [](https://travis-ci.org/Recondor/Fourmi)
+**Master branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=master)
 
-**Developing branch**: [](https://travis-ci.org/Recondor/Fourmi)
+**Developing branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)
 
 Fourmi is an web scraper for chemical substances. The program is designed to be
 used as a search engine to search multiple chemical databases for a specific
fourmi.py (52 lines changed)
--- a/fourmi.py
+++ b/fourmi.py
@@ -17,8 +17,8 @@ Options:
   --version                        Show version.
   --verbose                        Verbose logging output.
   --log=<file>                     Save log to an file.
-  -o <file> --output=<file>        Output file [default: result.*format*]
-  -f <format> --format=<format>    Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
+  -o <file> --output=<file>        Output file [default: results.*format*]
+  -f <format> --format=<format>    Output formats (supported: csv, json, jsonlines, xml) [default: csv]
   --include=<regex>                Include only sources that match these regular expressions split by a comma.
   --exclude=<regex>                Exclude the sources that match these regular expressions split by a comma.
 """
@@ -30,7 +30,8 @@ from scrapy.utils.project import get_project_settings
 import docopt
 
 from FourmiCrawler.spider import FourmiSpider
-from sourceloader import SourceLoader
+from utils.configurator import Configurator
+from utils.sourceloader import SourceLoader
 
 
 def setup_crawler(compound, settings, source_loader, attributes):
@@ -50,53 +51,16 @@ def setup_crawler(compound, settings, source_loader, attributes):
     crawler.start()
 
 
-def scrapy_settings_manipulation(docopt_arguments):
-    """
-    This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
-    project these are command line arguments.
-    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
-    """
-    settings = get_project_settings()
-
-    if docopt_arguments["--output"] != 'result.*format*':
-        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
-    elif docopt_arguments["--format"] == "jsonlines":
-        settings.overrides["FEED_URI"] = "results.json"
-    elif docopt_arguments["--format"] is not None:
-        settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]
-
-    if docopt_arguments["--format"] is not None:
-        settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
-
-    return settings
-
-
-def start_log(docopt_arguments):
-    """
-    This function starts the logging functionality of Scrapy using the settings given by the CLI.
-    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
-    """
-    if docopt_arguments["--log"] is not None:
-        if docopt_arguments["--verbose"]:
-            log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
-        else:
-            log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
-    else:
-        if docopt_arguments["--verbose"]:
-            log.start(logstdout=False, loglevel=log.DEBUG)
-        else:
-            log.start(logstdout=True, loglevel=log.WARNING)
-
-
 def search(docopt_arguments, source_loader):
     """
     The function that facilitates the search for a specific compound.
     :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
     :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
     """
-    start_log(docopt_arguments)
-    settings = scrapy_settings_manipulation(docopt_arguments)
-    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
+    conf = Configurator()
+    conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
+    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
+    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
     reactor.run()
 
 
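The net effect of the last hunk: the two removed module-level helpers are folded into the new Configurator class introduced in utils/configurator.py further down. A minimal usage sketch of the refactored flow, with hypothetical CLI values; the call signatures come from the diff itself:

    from utils.configurator import Configurator

    # Hypothetical docopt output; real values come from the command line.
    args = {"--log": "fourmi.log", "--verbose": True,
            "--output": "results.*format*", "--format": "csv"}

    conf = Configurator()
    conf.start_log(args["--log"], args["--verbose"])     # log to file at DEBUG level
    conf.set_output(args["--output"], args["--format"])  # default pattern -> "results.csv"
    print(conf.scrapy_settings.overrides["FEED_URI"])    # results.csv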
tests/test_configurator.py (new file, 27 lines)
--- /dev/null
+++ b/tests/test_configurator.py
@@ -0,0 +1,27 @@
+import unittest
+from utils.configurator import Configurator
+
+
+class TestConfigurator(unittest.TestCase):
+
+    def setUp(self):
+        self.conf = Configurator()
+
+    def test_set_output(self):
+        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
+
+        self.conf.set_output("results.*format*", "jsonlines")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")
+
+        self.conf.set_output("results.*format*", "csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
+
+    # def test_start_log(self):
+    #     self.conf.start_log("test.log", True)
+    #     self.conf.start_log("test.log", False)
+    #     self.conf.start_log(None, True)
+    #     self.conf.start_log(None, False)
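The start_log tests ship commented out, presumably because scrapy.log.start() has global side effects that do not sit well in a unit test. A sketch of how they could be activated with the mock library; using mock here is my assumption, not part of this commit:

    import unittest

    from mock import patch  # third-party `mock` package on Python 2
    from scrapy import log

    from utils.configurator import Configurator


    class TestStartLog(unittest.TestCase):

        # Patch the log.start attribute that utils.configurator imported, so no
        # real logging starts and the call arguments can be asserted instead.
        @patch('utils.configurator.log.start')
        def test_start_log_verbose_file(self, mock_start):
            Configurator().start_log("test.log", True)
            mock_start.assert_called_once_with(logfile="test.log", logstdout=False,
                                               loglevel=log.DEBUG)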
--- a/tests/test_sourceloader.py
+++ b/tests/test_sourceloader.py
@@ -1,6 +1,6 @@
 import unittest
 
-from sourceloader import SourceLoader
+from utils.sourceloader import SourceLoader
 
 
 class TestSourceloader(unittest.TestCase):
utils/__init__.py (new file, empty)

utils/configurator.py (new file, 49 lines)
--- /dev/null
+++ b/utils/configurator.py
@@ -0,0 +1,49 @@
+from scrapy import log
+from scrapy.utils.project import get_project_settings
+
+
+class Configurator:
+    """
+    A helper class in the fourmi class. This class is used to process the settings as set
+    from one of the Fourmi applications.
+    """
+
+    def __init__(self):
+        self.scrapy_settings = get_project_settings()
+
+
+    def set_output(self, filename, fileformat):
+        """
+        This function manipulates the Scrapy output file settings that normally would be set in the settings file.
+        In the Fourmi project these are command line arguments.
+        :param filename: The filename of the file where the output will be put.
+        :param fileformat: The format in which the output will be.
+        """
+
+        if filename != 'results.*format*':
+            self.scrapy_settings.overrides["FEED_URI"] = filename
+        elif fileformat == "jsonlines":
+            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+        elif fileformat is not None:
+            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+
+        if fileformat is not None:
+            self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
+
+
+    def start_log(self, logfile, verbose):
+        """
+        This function starts the logging functionality of Scrapy using the settings given by the CLI.
+        :param logfile: The location where the logfile will be saved.
+        :param verbose: A boolean value to switch between loglevels.
+        """
+        if logfile is not None:
+            if verbose:
+                log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
+            else:
+                log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
+        else:
+            if verbose:
+                log.start(logstdout=False, loglevel=log.DEBUG)
+            else:
+                log.start(logstdout=True, loglevel=log.WARNING)
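One subtlety that makes the tests above work: set_output() writes through scrapy_settings.overrides, while test_set_output() reads back with plain indexing. In the pre-1.0 Scrapy this project targets, the settings object returned by get_project_settings() consults its overrides dict first on lookup, so both views agree. A minimal illustration, to be run from the project root so the Scrapy settings module resolves:

    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    settings.overrides["FEED_FORMAT"] = "csv"
    assert settings["FEED_FORMAT"] == "csv"  # the override wins on lookup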
--- a/utils/sourceloader.py
+++ b/utils/sourceloader.py
@@ -9,7 +9,7 @@ from FourmiCrawler.sources.source import Source
 class SourceLoader:
     sources = []
 
-    def __init__(self, rel_dir="FourmiCrawler/sources"):
+    def __init__(self, rel_dir="../FourmiCrawler/sources"):
         """
         The initiation of a SourceLoader, selects and indexes a directory for usable sources.
         Also loads a configuration file for Sources and passes the arguments in
@@ -24,7 +24,7 @@ class SourceLoader:
         config.read('sources.cfg')
 
         for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
-            mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
+            mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
             classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
             for cls in classes:
                 if issubclass(cls, Source) and cls not in known_parser: