
Merge branch 'feature/restructure' into develop

Jip J. Dekker 2014-06-08 13:34:09 +02:00
commit efb7d60079
8 changed files with 90 additions and 50 deletions

View File

@@ -10,7 +10,7 @@ install:
 # command to run tests, e.g. python setup.py test
 script:
-  - nosetests --with-coverage --cover-package=FourmiCrawler tests
+  - nosetests --with-coverage --cover-package=FourmiCrawler,utils tests
 notifications:
   slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM

View File

@@ -1,8 +1,8 @@
 # Fourmi
-**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/Recondor/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=master)
+**Master branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=master)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=master)
-**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/Recondor/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=develop)
+**Developing branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=develop)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)
 
 Fourmi is a web scraper for chemical substances. The program is designed to be
 used as a search engine to search multiple chemical databases for a specific

View File

@@ -17,8 +17,8 @@ Options:
   --version                      Show version.
   --verbose                      Verbose logging output.
   --log=<file>                   Save log to a file.
-  -o <file> --output=<file>      Output file [default: result.*format*]
-  -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
+  -o <file> --output=<file>      Output file [default: results.*format*]
+  -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: csv]
   --include=<regex>              Include only sources that match these regular expressions split by a comma.
   --exclude=<regex>              Exclude the sources that match these regular expressions split by a comma.
 """
@@ -30,7 +30,8 @@ from scrapy.utils.project import get_project_settings
 import docopt
 
 from FourmiCrawler.spider import FourmiSpider
-from sourceloader import SourceLoader
+from utils.configurator import Configurator
+from utils.sourceloader import SourceLoader
 
 
 def setup_crawler(compound, settings, source_loader, attributes):
@@ -50,53 +51,16 @@ def setup_crawler(compound, settings, source_loader, attributes):
     crawler.start()
 
 
-def scrapy_settings_manipulation(docopt_arguments):
-    """
-    This function manipulates the Scrapy settings that would normally be set in the settings file. In the Fourmi
-    project these are command line arguments.
-    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
-    """
-    settings = get_project_settings()
-
-    if docopt_arguments["--output"] != 'result.*format*':
-        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
-    elif docopt_arguments["--format"] == "jsonlines":
-        settings.overrides["FEED_URI"] = "results.json"
-    elif docopt_arguments["--format"] is not None:
-        settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]
-
-    if docopt_arguments["--format"] is not None:
-        settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
-
-    return settings
-
-
-def start_log(docopt_arguments):
-    """
-    This function starts the logging functionality of Scrapy using the settings given by the CLI.
-    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
-    """
-    if docopt_arguments["--log"] is not None:
-        if docopt_arguments["--verbose"]:
-            log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
-        else:
-            log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
-    else:
-        if docopt_arguments["--verbose"]:
-            log.start(logstdout=False, loglevel=log.DEBUG)
-        else:
-            log.start(logstdout=True, loglevel=log.WARNING)
-
-
 def search(docopt_arguments, source_loader):
     """
     The function that facilitates the search for a specific compound.
     :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
     :param source_loader: An initialized SourceLoader object pointed at the directory with the sources.
     """
-    start_log(docopt_arguments)
-    settings = scrapy_settings_manipulation(docopt_arguments)
-    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
+    conf = Configurator()
+    conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
+    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
+    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
     reactor.run()
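The main block that wires these functions together falls outside the diff context. Purely as a hypothetical sketch of the flow after this commit (only the called names are taken from the diff itself):

    # Hypothetical wiring; the real entry point of this file is not shown here.
    if __name__ == '__main__':
        arguments = docopt.docopt(__doc__, version='Fourmi')
        loader = SourceLoader()    # now imported from utils.sourceloader
        search(arguments, loader)  # starts logging, sets output, runs the crawl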

View File

@@ -0,0 +1,27 @@
+import unittest
+
+from utils.configurator import Configurator
+
+
+class TestConfigurator(unittest.TestCase):
+
+    def setUp(self):
+        self.conf = Configurator()
+
+    def test_set_output(self):
+        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
+
+        self.conf.set_output("results.*format*", "jsonlines")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")
+
+        self.conf.set_output("results.*format*", "csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
+
+    # def test_start_log(self):
+    #     self.conf.start_log("test.log", True)
+    #     self.conf.start_log("test.log", False)
+    #     self.conf.start_log(None, True)
+    #     self.conf.start_log(None, False)
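The start_log tests are left commented out, plausibly because scrapy.log.start has global side effects and cannot be re-invoked cleanly within one test run. A hedged sketch of how they could be made runnable, assuming the third-party mock package (or unittest.mock on Python 3.3+) is available:

    import unittest
    from mock import patch  # or: from unittest.mock import patch

    from utils.configurator import Configurator

    class TestStartLog(unittest.TestCase):
        def test_start_log_verbose(self):
            conf = Configurator()
            # Patch the log object inside utils.configurator so no real
            # logging starts and the call arguments can be asserted.
            with patch('utils.configurator.log') as mock_log:
                conf.start_log("test.log", True)
                mock_log.start.assert_called_once_with(
                    logfile="test.log", logstdout=False, loglevel=mock_log.DEBUG)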

View File

@@ -1,6 +1,6 @@
 import unittest
 
-from sourceloader import SourceLoader
+from utils.sourceloader import SourceLoader
 
 
 class TestSourceloader(unittest.TestCase):

utils/__init__.py Normal file (0 lines added)
View File

utils/configurator.py Normal file (49 lines added)
View File

@@ -0,0 +1,49 @@
+from scrapy import log
+from scrapy.utils.project import get_project_settings
+
+
+class Configurator:
+    """
+    A helper class for Fourmi. This class is used to process the settings as set
+    from one of the Fourmi applications.
+    """
+
+    def __init__(self):
+        self.scrapy_settings = get_project_settings()
+
+    def set_output(self, filename, fileformat):
+        """
+        This function manipulates the Scrapy output file settings that would normally be set in the settings file.
+        In the Fourmi project these are command line arguments.
+        :param filename: The name of the file where the output will be written.
+        :param fileformat: The format in which the output will be written.
+        """
+        if filename != 'results.*format*':
+            self.scrapy_settings.overrides["FEED_URI"] = filename
+        elif fileformat == "jsonlines":
+            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+        elif fileformat is not None:
+            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+
+        if fileformat is not None:
+            self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
+
+    def start_log(self, logfile, verbose):
+        """
+        This function starts the logging functionality of Scrapy using the settings given by the CLI.
+        :param logfile: The location where the logfile will be saved.
+        :param verbose: A boolean value to switch between loglevels.
+        """
+        if logfile is not None:
+            if verbose:
+                log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
+            else:
+                log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
+        else:
+            if verbose:
+                log.start(logstdout=False, loglevel=log.DEBUG)
+            else:
+                log.start(logstdout=True, loglevel=log.WARNING)
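To make the set_output branches concrete, a short illustrative walk-through (not part of the commit; the dictionary-style reads mirror the tests above):

    conf = Configurator()

    conf.set_output("mine.csv", "csv")                # an explicit filename always wins
    conf.scrapy_settings["FEED_URI"]                  # -> "mine.csv"

    conf.set_output("results.*format*", "jsonlines")  # the placeholder plus "jsonlines"
    conf.scrapy_settings["FEED_URI"]                  # -> "results.json", avoiding the
                                                      #    awkward "results.jsonlines"

    conf.set_output("results.*format*", "xml")        # any other format derives the extension
    conf.scrapy_settings["FEED_URI"]                  # -> "results.xml"
    conf.scrapy_settings["FEED_FORMAT"]               # -> "xml"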

View File

@@ -8,7 +8,7 @@ from FourmiCrawler.sources.source import Source
 class SourceLoader:
     sources = []
 
-    def __init__(self, rel_dir="FourmiCrawler/sources"):
+    def __init__(self, rel_dir="../FourmiCrawler/sources"):
         """
         The initialization of a SourceLoader selects and indexes a directory for usable sources.
         :param rel_dir: A relative path to a directory.
         """
@@ -18,7 +18,7 @@ class SourceLoader:
         known_parser = set()
 
         for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
-            mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
+            mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
             classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
             for cls in classes:
                 if issubclass(cls, Source) and cls not in known_parser:
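Why "../" appears twice above: after the restructure, utils/sourceloader.py sits one level below the project root, so the directory scanned on disk needs the "../" prefix, while the dotted module path handed to __import__ must be rooted at the project and therefore strips it again. A small hedged illustration (how path is derived from rel_dir is outside this hunk, so that line is an assumption):

    import os

    rel_dir = "../FourmiCrawler/sources"
    # Assumed: the scanned directory is resolved relative to this file.
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_dir)

    module_prefix = rel_dir.replace("../", "").replace("/", ".")
    # module_prefix == "FourmiCrawler.sources"; each source file f.py is then
    # imported as FourmiCrawler.sources.f via __import__.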