Merge branch 'feature/extended-logging' into develop

Jip J. Dekker 2014-06-15 21:06:06 +02:00
commit cd058cab1f
11 changed files with 97 additions and 64 deletions

View File

@@ -26,9 +26,8 @@ class ChemSpider(Source):
structure = 'Chemical-Structure.%s.html'
extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
def __init__(self, config={}):
def __init__(self, config=None):
Source.__init__(self, config)
self.cfg = config
self.ignore_list = []
if 'token' not in self.cfg or self.cfg['token'] == '':
log.msg('ChemSpider token not set or empty, search/MassSpec API '
@@ -37,7 +36,6 @@ class ChemSpider(Source):
self.search += self.cfg['token']
self.extendedinfo += self.cfg['token']
def parse(self, response):
sel = Selector(response)
requests = []
@@ -199,7 +197,8 @@ class ChemSpider(Source):
return properties
def newresult(self, attribute, value, conditions='', source='ChemSpider'):
return Result({
return Result(
{
'attribute': attribute,
'value': value,
'source': source,

View File

@@ -22,12 +22,9 @@ class NIST(Source):
search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
cfg = {}
def __init__(self, config={}):
def __init__(self, config=None):
Source.__init__(self, config)
self.ignore_list = set()
self.cfg = config
def parse(self, response):
sel = Selector(response)
@@ -88,7 +85,6 @@ class NIST(Source):
InChiKey, CAS number
"""
ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
li = ul.xpath('li')
raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
for synonym in raw_synonyms[0].strip().split(';\n'):
@@ -255,7 +251,8 @@ class NIST(Source):
return results
def newresult(self, attribute, value, conditions=''):
return Result({
return Result(
{
'attribute': attribute,
'value': value,
'source': 'NIST',

View File

@@ -1,9 +1,11 @@
import re
from scrapy.http import Request
from scrapy import log
from source import Source
from scrapy.selector import Selector
from source import Source
from FourmiCrawler.items import Result
import re
class WikipediaParser(Source):
@@ -17,11 +19,8 @@ class WikipediaParser(Source):
__spider = None
searched_compounds = []
cfg = {}
def __init__(self, config={}):
def __init__(self, config=None):
Source.__init__(self, config)
self.cfg = config
def parse(self, response):
"""
@@ -123,7 +122,6 @@ class WikipediaParser(Source):
level=log.DEBUG)
return items
def new_compound_request(self, compound):
return Request(url=self.website[:-1] + compound, callback=self.parse)
@@ -161,7 +159,8 @@ class WikipediaParser(Source):
return links
def newresult(self, attribute, value):
return Result({
return Result(
{
'attribute': attribute,
'value': value,
'source': 'Wikipedia',

View File

@@ -6,10 +6,13 @@ class Source:
website = "http://something/*" # Regex of URI's the source is able to parse
_spider = None
def __init__(self, config={}):
def __init__(self, config=None):
"""
Initiation of a new Source
"""
self.cfg = {}
if config is not None:
self.cfg = config
pass
def parse(self, response):
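
The recurring change across the source classes above and below is the same: the mutable default argument config={} is replaced by a None sentinel, and the base Source.__init__ now owns the self.cfg assignment, which is why the subclasses drop their own self.cfg = config lines. A minimal, self-contained sketch of the pitfall being fixed; the Broken/Fixed class names are illustrative and not part of the repository:

# Illustrative only: a default dict is created once, at function definition
# time, so every instance constructed without an explicit config shares it.
class Broken(object):
    def __init__(self, config={}):
        self.cfg = config

# The None sentinel gives each instance its own dict, as Source.__init__ now does.
class Fixed(object):
    def __init__(self, config=None):
        self.cfg = {}
        if config is not None:
            self.cfg = config

a, b = Broken(), Broken()
a.cfg['token'] = 'abc'
print(b.cfg)   # {'token': 'abc'} -- the shared dict leaked into another instance
c, d = Fixed(), Fixed()
c.cfg['token'] = 'abc'
print(d.cfg)   # {}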

View File

@@ -10,7 +10,7 @@ class FourmiSpider(Spider):
"""
name = "FourmiSpider"
def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
"""
Initiation of the Spider
:param compound: compound that will be searched.
@@ -20,6 +20,9 @@ class FourmiSpider(Spider):
self.synonyms = set()
super(FourmiSpider, self).__init__(*args, **kwargs)
self.synonyms.add(compound)
if selected_attributes is None:
self.selected_attributes = [".*"]
else:
self.selected_attributes = selected_attributes
def parse(self, response):
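
The spider applies the same sentinel pattern to its default attribute filter: omitting selected_attributes now yields a fresh [".*"] list per instance instead of a shared module-level one. A rough usage sketch of what the constructor now guarantees, assuming Scrapy is installed and the FourmiCrawler package is importable; the compound name is made up:

from FourmiCrawler.spider import FourmiSpider

spider = FourmiSpider(compound="caffeine")
print(spider.selected_attributes)   # ['.*'] -- match-all default, built per instance

spider = FourmiSpider(compound="caffeine",
                      selected_attributes=["melting point", "boiling point"])
print(spider.selected_attributes)   # the explicit filter is kept as-is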

View File

@@ -5,6 +5,7 @@ Fourmi, a web scraper build to search specific information for a given compound
Usage:
fourmi search <compound>
fourmi [options] search <compound>
fourmi [-v | -vv | -vvv] [options] search <compound>
fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
fourmi list
fourmi [--include=<sourcename> | --exclude=<sourcename>] list
@@ -15,7 +16,7 @@ Options:
--attributes=<regex> Include only that match these regular expressions split by a comma. [default: .*]
-h --help Show this screen.
--version Show version.
--verbose Verbose logging output.
-v Verbose logging output. (Multiple occurrences increase logging level)
--log=<file> Save log to an file.
-o <file> --output=<file> Output file [default: results.*format*]
-f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
@@ -25,8 +26,7 @@ Options:
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings
from scrapy import signals, log
import docopt
from FourmiCrawler.spider import FourmiSpider
@@ -58,9 +58,12 @@ def search(docopt_arguments, source_loader):
:param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
"""
conf = Configurator()
conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
source_loader, docopt_arguments["--attributes"].split(','))
log.start(conf.scrapy_settings.get("LOG_FILE"),
conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
reactor.run()
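
The usage-pattern change from --verbose to the repeatable -v flag relies on docopt counting stacked short options, so docopt_arguments["-v"] reaches search() as an integer between 0 and 3 rather than a boolean. A small standalone sketch of that behaviour, deliberately kept separate from the real fourmi docstring:

from docopt import docopt

usage = """
Usage:
  fourmi [-v | -vv | -vvv] search <compound>

Options:
  -v    Verbose logging output.
"""

args = docopt(usage, argv=['-vv', 'search', 'caffeine'])
print(args['-v'])           # 2 -- docopt counts the repeated flag
print(args['<compound>'])   # 'caffeine'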

View File

@@ -1,7 +1,8 @@
import unittest
import ConfigParser
from utils.configurator import Configurator
import ConfigParser
class TestConfigurator(unittest.TestCase):
@@ -21,11 +22,28 @@ class TestConfigurator(unittest.TestCase):
self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
# def test_start_log(self):
# self.conf.start_log("test.log", True)
# self.conf.start_log("test.log", False)
# self.conf.start_log(None, True)
# self.conf.start_log(None, False)
def test_start_log(self):
for i in range(0, 4):
self.conf.set_logging("TEST", i)
self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST")
if i > 0:
self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True)
if i > 1:
self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False)
else:
self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
else:
self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False)
self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
if i == 1:
self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING")
elif i == 2:
self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO")
elif i == 3:
self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG")
self.conf.set_logging(verbose=i)
self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None)
def test_read_sourceconfiguration(self):
config = self.conf.read_sourceconfiguration()

View File

@@ -13,6 +13,7 @@ class TestPipelines(unittest.TestCase):
def test_none_pipeline(self):
# Testing the pipeline that replaces the None values in items.
self.testItem["value"] = "abc"
self.testItem["source"] = None
pipe = pipelines.RemoveNonePipeline()
processed = pipe.process_item(self.testItem, spider.FourmiSpider())

View File

@@ -47,7 +47,6 @@ class TestFoumiSpider(unittest.TestCase):
self.assertGreater(len(requests), 0)
self.assertIsInstance(requests[0], Request)
def test_synonym_requests(self):
# A test for the synonym request function
self.spi._sources = []

View File

@@ -1,7 +1,8 @@
from scrapy import log
from scrapy.utils.project import get_project_settings
import ConfigParser
from scrapy.utils.project import get_project_settings
class Configurator:
"""
A helper class in the fourmi class. This class is used to process the settings as set
@@ -11,7 +12,6 @@ class Configurator:
def __init__(self):
self.scrapy_settings = get_project_settings()
def set_output(self, filename, fileformat):
"""
This function manipulates the Scrapy output file settings that normally would be set in the settings file.
@@ -30,23 +30,34 @@ class Configurator:
if fileformat is not None:
self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
def start_log(self, logfile, verbose):
def set_logging(self, logfile=None, verbose=0):
"""
This function starts the logging functionality of Scrapy using the settings given by the CLI.
This function changes the default settings of Scrapy's logging functionality
using the settings given by the CLI.
:param logfile: The location where the logfile will be saved.
:param verbose: A boolean value to switch between loglevels.
:param verbose: An integer value to switch between log levels.
"""
if verbose != 0:
self.scrapy_settings.overrides["LOG_ENABLED"] = True
else:
self.scrapy_settings.overrides["LOG_ENABLED"] = False
if verbose == 1:
self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
elif verbose == 2:
self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
else:
self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"
if verbose > 1:
self.scrapy_settings.overrides["LOG_STDOUT"] = False
else:
self.scrapy_settings.overrides["LOG_STDOUT"] = True
if logfile is not None:
if verbose:
log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
self.scrapy_settings.overrides["LOG_FILE"] = logfile
else:
log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
else:
if verbose:
log.start(logstdout=False, loglevel=log.DEBUG)
else:
log.start(logstdout=True, loglevel=log.WARNING)
self.scrapy_settings.overrides["LOG_FILE"] = None
@staticmethod
def read_sourceconfiguration():
@@ -75,7 +86,6 @@ class Configurator:
elif config.defaults():
section = config.defaults()
if 'reliability' not in section:
log.msg('Reliability not set for %s' % sourcename,
level=log.WARNING)
print 'WARNING: Reliability not set for %s' % sourcename
section['reliability'] = ''
return section
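
With set_logging, the Configurator only records the requested verbosity in Scrapy's settings overrides; the single log.start call now happens later, in fourmi.py. A rough sketch of the resulting mapping, assuming it runs inside the project so get_project_settings() resolves and the old-style settings.overrides attribute is still available:

from utils.configurator import Configurator

conf = Configurator()
for verbose in range(4):
    conf.set_logging(logfile=None, verbose=verbose)
    print("%s %s %s %s" % (verbose,
                           conf.scrapy_settings.get("LOG_ENABLED"),
                           conf.scrapy_settings.get("LOG_LEVEL"),
                           conf.scrapy_settings.get("LOG_STDOUT")))

# Expected, following the branches above:
#   0  False  DEBUG    True   (logging disabled, so the level is irrelevant)
#   1  True   WARNING  True
#   2  True   INFO     False
#   3  True   DEBUG    False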

View File

@@ -5,6 +5,7 @@ import re
from FourmiCrawler.sources.source import Source
from utils.configurator import Configurator
class SourceLoader:
sources = []