
Merge branch 'feature/extended-logging' into develop

Jip J. Dekker 2014-06-15 21:06:06 +02:00
commit cd058cab1f
11 changed files with 97 additions and 64 deletions

View File

@@ -26,9 +26,8 @@ class ChemSpider(Source):
     structure = 'Chemical-Structure.%s.html'
     extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
 
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
-        self.cfg = config
         self.ignore_list = []
         if 'token' not in self.cfg or self.cfg['token'] == '':
             log.msg('ChemSpider token not set or empty, search/MassSpec API '
@@ -37,7 +36,6 @@ class ChemSpider(Source):
             self.search += self.cfg['token']
             self.extendedinfo += self.cfg['token']
 
-
     def parse(self, response):
         sel = Selector(response)
         requests = []
@@ -199,7 +197,8 @@ class ChemSpider(Source):
         return properties
 
     def newresult(self, attribute, value, conditions='', source='ChemSpider'):
-        return Result({
+        return Result(
+            {
             'attribute': attribute,
             'value': value,
             'source': source,
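
A note on the `def __init__(self, config={})` to `def __init__(self, config=None)` change above (the same change recurs in the NIST, Wikipedia, and base Source files below): a mutable default like `{}` is created once when the function is defined and then shared by every call that omits the argument. A minimal standalone sketch of the pitfall and of the `None`-sentinel fix; the function names are illustrative and not from this repository:

def bad(config={}):
    # The same dict object is reused by every call that omits the argument.
    config['calls'] = config.get('calls', 0) + 1
    return config


def good(config=None):
    # A fresh dict is created per call when no argument is given.
    config = {} if config is None else config
    config['calls'] = config.get('calls', 0) + 1
    return config


print(bad())   # {'calls': 1}
print(bad())   # {'calls': 2}  <- state leaked from the first call
print(good())  # {'calls': 1}
print(good())  # {'calls': 1}  <- no shared state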

View File

@@ -22,12 +22,9 @@ class NIST(Source):
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
-    cfg = {}
-
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
         self.ignore_list = set()
-        self.cfg = config
 
     def parse(self, response):
         sel = Selector(response)
@@ -88,7 +85,6 @@ class NIST(Source):
         InChiKey, CAS number
         """
         ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
-        li = ul.xpath('li')
 
         raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
         for synonym in raw_synonyms[0].strip().split(';\n'):
@@ -255,7 +251,8 @@ class NIST(Source):
         return results
 
     def newresult(self, attribute, value, conditions=''):
-        return Result({
+        return Result(
+            {
             'attribute': attribute,
             'value': value,
             'source': 'NIST',

View File

@@ -1,9 +1,11 @@
+import re
+
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re
 
 
 class WikipediaParser(Source):
@@ -17,11 +19,8 @@ class WikipediaParser(Source):
     __spider = None
     searched_compounds = []
 
-    cfg = {}
-
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
-        self.cfg = config
 
     def parse(self, response):
         """
@@ -53,7 +52,7 @@ class WikipediaParser(Source):
         # scrape the chembox (wikipedia template)
         items = self.parse_chembox(sel, items)
 
-        #scrape the drugbox (wikipedia template)
+        # scrape the drugbox (wikipedia template)
         items = self.parse_drugbox(sel, items)
 
         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
@@ -123,7 +122,6 @@ class WikipediaParser(Source):
                     level=log.DEBUG)
         return items
 
-
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)
@@ -161,7 +159,8 @@ class WikipediaParser(Source):
         return links
 
     def newresult(self, attribute, value):
-        return Result({
+        return Result(
+            {
             'attribute': attribute,
             'value': value,
             'source': 'Wikipedia',

View File

@@ -6,10 +6,13 @@ class Source:
     website = "http://something/*"  # Regex of URI's the source is able to parse
     _spider = None
 
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         """
         Initiation of a new Source
         """
+        self.cfg = {}
+        if config is not None:
+            self.cfg = config
         pass
 
     def parse(self, response):
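
With the base-class change above, `Source.__init__` now guarantees that `self.cfg` is always a dict (empty when no configuration is passed), which is why the subclasses in this commit drop their own `self.cfg = config` lines. A hedged sketch of the resulting contract; `ExampleSource` and the `token` key are illustrative, not part of the repository:

class Source:
    def __init__(self, config=None):
        # Always leave self.cfg as a dict, even when no configuration is supplied.
        self.cfg = {}
        if config is not None:
            self.cfg = config


class ExampleSource(Source):
    def __init__(self, config=None):
        Source.__init__(self, config)
        # self.cfg is already set by the base class, so it is safe to read here.
        self.token = self.cfg.get('token', '')


print(ExampleSource().token)                  # '' (no config supplied)
print(ExampleSource({'token': 'abc'}).token)  # 'abc'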

View File

@@ -10,7 +10,7 @@ class FourmiSpider(Spider):
     """
     name = "FourmiSpider"
 
-    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
+    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
         """
         Initiation of the Spider
         :param compound: compound that will be searched.
@@ -20,6 +20,9 @@ class FourmiSpider(Spider):
         self.synonyms = set()
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.add(compound)
-        self.selected_attributes = selected_attributes
+        if selected_attributes is None:
+            self.selected_attributes = [".*"]
+        else:
+            self.selected_attributes = selected_attributes
 
     def parse(self, response):

View File

@@ -5,6 +5,7 @@ Fourmi, a web scraper build to search specific information for a given compound
 Usage:
     fourmi search <compound>
     fourmi [options] search <compound>
+    fourmi [-v | -vv | -vvv] [options] search <compound>
     fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
     fourmi list
     fourmi [--include=<sourcename> | --exclude=<sourcename>] list
@@ -15,7 +16,7 @@ Options:
     --attributes=<regex>  Include only that match these regular expressions split by a comma. [default: .*]
     -h --help  Show this screen.
     --version  Show version.
-    --verbose  Verbose logging output.
+    -v  Verbose logging output. (Multiple occurrences increase logging level)
     --log=<file>  Save log to an file.
     -o <file> --output=<file>  Output file [default: results.*format*]
     -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: csv]
@@ -25,8 +26,7 @@ Options:
 
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
-from scrapy import log, signals
-from scrapy.utils.project import get_project_settings
+from scrapy import signals, log
 import docopt
 
 from FourmiCrawler.spider import FourmiSpider
@@ -58,9 +58,12 @@ def search(docopt_arguments, source_loader):
     :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
     """
     conf = Configurator()
-    conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
+    conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
     conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
-    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
+    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
+                  source_loader, docopt_arguments["--attributes"].split(','))
+    log.start(conf.scrapy_settings.get("LOG_FILE"),
+              conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
     reactor.run()
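
The usage line `fourmi [-v | -vv | -vvv] [options] search <compound>` added above is the usual docopt idiom for a counted flag: when the pattern lets an option repeat, `docopt()` returns the number of occurrences as an integer instead of a boolean, which is what `docopt_arguments["-v"]` passes to `conf.set_logging`. A minimal standalone sketch, assuming docopt is installed; the docstring and argv below are illustrative:

from docopt import docopt

usage = """Demo.

Usage:
    demo [-v | -vv | -vvv] run

Options:
    -v  Verbose output. (Multiple occurrences increase the level)
"""

# docopt counts repeated flags, so '-vv' yields the integer 2.
arguments = docopt(usage, argv=['-vv', 'run'])
print(arguments['-v'])  # 2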

View File

@@ -1,7 +1,8 @@
 import unittest
 
+import ConfigParser
+
 from utils.configurator import Configurator
-import ConfigParser
 
 
 class TestConfigurator(unittest.TestCase):
@@ -21,11 +22,28 @@ class TestConfigurator(unittest.TestCase):
         self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
 
-    # def test_start_log(self):
-    #     self.conf.start_log("test.log", True)
-    #     self.conf.start_log("test.log", False)
-    #     self.conf.start_log(None, True)
-    #     self.conf.start_log(None, False)
+    def test_start_log(self):
+        for i in range(0, 3):
+            self.conf.set_logging("TEST", i)
+            self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST")
+            if i > 0:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True)
+                if i > 1:
+                    self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False)
+                else:
+                    self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
+            else:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False)
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
+            if i == 1:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING")
+            elif i == 2:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO")
+            elif i == 3:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG")
+
+            self.conf.set_logging(verbose=i)
+            self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None)
 
     def test_read_sourceconfiguration(self):
         config = self.conf.read_sourceconfiguration()

View File

@@ -13,6 +13,7 @@ class TestPipelines(unittest.TestCase):
     def test_none_pipeline(self):
         # Testing the pipeline that replaces the None values in items.
         self.testItem["value"] = "abc"
+        self.testItem["source"] = None
         pipe = pipelines.RemoveNonePipeline()
         processed = pipe.process_item(self.testItem, spider.FourmiSpider())
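
For context on the test above: a Scrapy item pipeline exposes `process_item(item, spider)` and returns the (possibly modified) item; the added `self.testItem["source"] = None` line gives the pipeline a None value to clean up. A hedged sketch of what a None-removing pipeline typically looks like, not necessarily the project's actual implementation:

class RemoveNonePipeline(object):
    def process_item(self, item, spider):
        # Blank out None values so the exporters (csv, json, ...) never see them.
        for key in item:
            if item[key] is None:
                item[key] = ""
        return item


pipe = RemoveNonePipeline()
print(pipe.process_item({"value": "abc", "source": None}, spider=None))
# {'value': 'abc', 'source': ''}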

View File

@@ -47,7 +47,6 @@ class TestFoumiSpider(unittest.TestCase):
         self.assertGreater(len(requests), 0)
         self.assertIsInstance(requests[0], Request)
 
-
     def test_synonym_requests(self):
         # A test for the synonym request function
         self.spi._sources = []

View File

@@ -1,7 +1,8 @@
-from scrapy import log
-from scrapy.utils.project import get_project_settings
 import ConfigParser
+
+from scrapy.utils.project import get_project_settings
 
 
 class Configurator:
     """
     A helper class in the fourmi class. This class is used to process the settings as set
@@ -11,7 +12,6 @@ class Configurator:
     def __init__(self):
         self.scrapy_settings = get_project_settings()
 
-
     def set_output(self, filename, fileformat):
         """
         This function manipulates the Scrapy output file settings that normally would be set in the settings file.
@@ -30,23 +30,34 @@ class Configurator:
         if fileformat is not None:
             self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
 
-    def start_log(self, logfile, verbose):
+    def set_logging(self, logfile=None, verbose=0):
         """
-        This function starts the logging functionality of Scrapy using the settings given by the CLI.
+        This function changes the default settings of Scapy's logging functionality
+        using the settings given by the CLI.
         :param logfile: The location where the logfile will be saved.
-        :param verbose: A boolean value to switch between loglevels.
+        :param verbose: A integer value to switch between loglevels.
         """
+        if verbose != 0:
+            self.scrapy_settings.overrides["LOG_ENABLED"] = True
+        else:
+            self.scrapy_settings.overrides["LOG_ENABLED"] = False
+
+        if verbose == 1:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
+        elif verbose == 2:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
+        else:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"
+
+        if verbose > 1:
+            self.scrapy_settings.overrides["LOG_STDOUT"] = False
+        else:
+            self.scrapy_settings.overrides["LOG_STDOUT"] = True
+
         if logfile is not None:
-            if verbose:
-                log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
-            else:
-                log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
+            self.scrapy_settings.overrides["LOG_FILE"] = logfile
         else:
-            if verbose:
-                log.start(logstdout=False, loglevel=log.DEBUG)
-            else:
-                log.start(logstdout=True, loglevel=log.WARNING)
+            self.scrapy_settings.overrides["LOG_FILE"] = None
 
     @staticmethod
     def read_sourceconfiguration():
@@ -75,7 +86,6 @@ class Configurator:
         elif config.defaults():
             section = config.defaults()
         if 'reliability' not in section:
-            log.msg('Reliability not set for %s' % sourcename,
-                    level=log.WARNING)
+            print 'WARNING: Reliability not set for %s' % sourcename
             section['reliability'] = ''
         return section
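
The new `set_logging` above only stores overrides on the Scrapy settings; `fourmi` then calls `log.start` once with those values. As a compact reference, a standalone sketch of the same verbosity mapping in plain Python (no Scrapy dependency; the function name is illustrative):

def logging_settings(verbose=0, logfile=None):
    """Map a -v count onto the log settings used by the crawler."""
    return {
        "LOG_ENABLED": verbose != 0,
        # 1 -> WARNING, 2 -> INFO, anything else -> DEBUG
        "LOG_LEVEL": {1: "WARNING", 2: "INFO"}.get(verbose, "DEBUG"),
        # From -vv upwards the log is kept off stdout.
        "LOG_STDOUT": verbose <= 1,
        "LOG_FILE": logfile,
    }


print(logging_settings(0))                # logging disabled, no file
print(logging_settings(2, "fourmi.log"))  # INFO level, written to fourmi.log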

View File

@@ -5,6 +5,7 @@ import re
 from FourmiCrawler.sources.source import Source
 from utils.configurator import Configurator
 
+
 class SourceLoader:
     sources = []