
Merge branch 'feature/extended-logging' into develop

Jip J. Dekker 2014-06-15 21:06:06 +02:00
commit cd058cab1f
11 changed files with 97 additions and 64 deletions

View File

@@ -26,9 +26,8 @@ class ChemSpider(Source):
     structure = 'Chemical-Structure.%s.html'
     extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
 
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
-        self.cfg = config
         self.ignore_list = []
         if 'token' not in self.cfg or self.cfg['token'] == '':
             log.msg('ChemSpider token not set or empty, search/MassSpec API '
@@ -37,7 +36,6 @@ class ChemSpider(Source):
             self.search += self.cfg['token']
             self.extendedinfo += self.cfg['token']
 
-
     def parse(self, response):
         sel = Selector(response)
         requests = []
@@ -199,7 +197,8 @@ class ChemSpider(Source):
         return properties
 
     def newresult(self, attribute, value, conditions='', source='ChemSpider'):
-        return Result({
+        return Result(
+            {
             'attribute': attribute,
             'value': value,
             'source': source,
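
A note on the `def __init__(self, config={})` to `def __init__(self, config=None)` change above (the same change recurs in the NIST, Wikipedia, and base Source files below): a mutable default like `{}` is created once when the function is defined and then shared by every call that omits the argument. A minimal standalone sketch of the pitfall and of the `None`-sentinel fix; the function names are illustrative and not from this repository:

def bad(config={}):
    # The same dict object is reused by every call that omits the argument.
    config['calls'] = config.get('calls', 0) + 1
    return config


def good(config=None):
    # A fresh dict is created per call when no argument is given.
    config = {} if config is None else config
    config['calls'] = config.get('calls', 0) + 1
    return config


print(bad())   # {'calls': 1}
print(bad())   # {'calls': 2}  <- state leaked from the first call
print(good())  # {'calls': 1}
print(good())  # {'calls': 1}  <- no shared state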

View File

@@ -22,12 +22,9 @@ class NIST(Source):
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
-    cfg = {}
-
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
         self.ignore_list = set()
-        self.cfg = config
 
     def parse(self, response):
         sel = Selector(response)
@@ -88,7 +85,6 @@ class NIST(Source):
         InChiKey, CAS number
         """
         ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
-        li = ul.xpath('li')
 
         raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
         for synonym in raw_synonyms[0].strip().split(';\n'):
@@ -255,7 +251,8 @@ class NIST(Source):
         return results
 
     def newresult(self, attribute, value, conditions=''):
-        return Result({
+        return Result(
+            {
             'attribute': attribute,
             'value': value,
             'source': 'NIST',

View File

@@ -1,9 +1,11 @@
+import re
+
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re
 
 
 class WikipediaParser(Source):
@@ -17,11 +19,8 @@ class WikipediaParser(Source):
     __spider = None
     searched_compounds = []
 
-    cfg = {}
-
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
-        self.cfg = config
 
     def parse(self, response):
         """
@@ -53,7 +52,7 @@ class WikipediaParser(Source):
         # scrape the chembox (wikipedia template)
         items = self.parse_chembox(sel, items)
 
-        #scrape the drugbox (wikipedia template)
+        # scrape the drugbox (wikipedia template)
         items = self.parse_drugbox(sel, items)
 
         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
@@ -123,7 +122,6 @@ class WikipediaParser(Source):
                     level=log.DEBUG)
         return items
 
-
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)
@@ -161,7 +159,8 @@ class WikipediaParser(Source):
         return links
 
     def newresult(self, attribute, value):
-        return Result({
+        return Result(
+            {
             'attribute': attribute,
             'value': value,
             'source': 'Wikipedia',

View File

@@ -6,10 +6,13 @@ class Source:
     website = "http://something/*"  # Regex of URI's the source is able to parse
     _spider = None
 
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         """
         Initiation of a new Source
         """
+        self.cfg = {}
+        if config is not None:
+            self.cfg = config
         pass
 
     def parse(self, response):
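
With the base-class change above, `Source.__init__` now guarantees that `self.cfg` is always a dict (empty when no configuration is passed), which is why the subclasses in this commit drop their own `self.cfg = config` lines. A hedged sketch of the resulting contract; `ExampleSource` and the `token` key are illustrative, not part of the repository:

class Source:
    def __init__(self, config=None):
        # Always leave self.cfg as a dict, even when no configuration is supplied.
        self.cfg = {}
        if config is not None:
            self.cfg = config


class ExampleSource(Source):
    def __init__(self, config=None):
        Source.__init__(self, config)
        # self.cfg is already set by the base class, so it is safe to read here.
        self.token = self.cfg.get('token', '')


print(ExampleSource().token)                  # '' (no config supplied)
print(ExampleSource({'token': 'abc'}).token)  # 'abc'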

View File

@@ -10,7 +10,7 @@ class FourmiSpider(Spider):
     """
     name = "FourmiSpider"
 
-    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
+    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
         """
         Initiation of the Spider
         :param compound: compound that will be searched.
@@ -20,6 +20,9 @@ class FourmiSpider(Spider):
         self.synonyms = set()
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.add(compound)
-        self.selected_attributes = selected_attributes
+        if selected_attributes is None:
+            self.selected_attributes = [".*"]
+        else:
+            self.selected_attributes = selected_attributes
 
     def parse(self, response):

View File

@@ -5,6 +5,7 @@ Fourmi, a web scraper build to search specific information for a given compound
 Usage:
     fourmi search <compound>
     fourmi [options] search <compound>
+    fourmi [-v | -vv | -vvv] [options] search <compound>
     fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
     fourmi list
     fourmi [--include=<sourcename> | --exclude=<sourcename>] list
@@ -15,7 +16,7 @@ Options:
     --attributes=<regex>  Include only that match these regular expressions split by a comma. [default: .*]
     -h --help  Show this screen.
     --version  Show version.
-    --verbose  Verbose logging output.
+    -v  Verbose logging output. (Multiple occurrences increase logging level)
     --log=<file>  Save log to an file.
     -o <file> --output=<file>  Output file [default: results.*format*]
     -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: csv]
@@ -25,8 +26,7 @@ Options:
 
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
-from scrapy import log, signals
-from scrapy.utils.project import get_project_settings
+from scrapy import signals, log
 import docopt
 
 from FourmiCrawler.spider import FourmiSpider
@@ -58,9 +58,12 @@ def search(docopt_arguments, source_loader):
     :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
     """
     conf = Configurator()
-    conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
+    conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
     conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
-    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
+    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
+                  source_loader, docopt_arguments["--attributes"].split(','))
+    log.start(conf.scrapy_settings.get("LOG_FILE"),
+              conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
     reactor.run()
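
The usage line `fourmi [-v | -vv | -vvv] [options] search <compound>` added above is the usual docopt idiom for a counted flag: when the pattern lets an option repeat, `docopt()` returns the number of occurrences as an integer instead of a boolean, which is what `docopt_arguments["-v"]` passes to `conf.set_logging`. A minimal standalone sketch, assuming docopt is installed; the docstring and argv below are illustrative:

from docopt import docopt

usage = """Demo.

Usage:
    demo [-v | -vv | -vvv] run

Options:
    -v  Verbose output. (Multiple occurrences increase the level)
"""

# docopt counts repeated flags, so '-vv' yields the integer 2.
arguments = docopt(usage, argv=['-vv', 'run'])
print(arguments['-v'])  # 2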

View File

@@ -1,7 +1,8 @@
 import unittest
 
+import ConfigParser
+
 from utils.configurator import Configurator
-import ConfigParser
 
 
 class TestConfigurator(unittest.TestCase):
@@ -21,11 +22,28 @@ class TestConfigurator(unittest.TestCase):
         self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
 
-    # def test_start_log(self):
-    #     self.conf.start_log("test.log", True)
-    #     self.conf.start_log("test.log", False)
-    #     self.conf.start_log(None, True)
-    #     self.conf.start_log(None, False)
+    def test_start_log(self):
+        for i in range(0, 3):
+            self.conf.set_logging("TEST", i)
+            self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST")
+            if i > 0:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True)
+                if i > 1:
+                    self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False)
+                else:
+                    self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
+            else:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False)
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
+            if i == 1:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING")
+            elif i == 2:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO")
+            elif i == 3:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG")
+
+            self.conf.set_logging(verbose=i)
+            self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None)
 
     def test_read_sourceconfiguration(self):
         config = self.conf.read_sourceconfiguration()

View File

@@ -13,6 +13,7 @@ class TestPipelines(unittest.TestCase):
     def test_none_pipeline(self):
         # Testing the pipeline that replaces the None values in items.
         self.testItem["value"] = "abc"
+        self.testItem["source"] = None
         pipe = pipelines.RemoveNonePipeline()
         processed = pipe.process_item(self.testItem, spider.FourmiSpider())
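
For context on the test above: a Scrapy item pipeline exposes `process_item(item, spider)` and returns the (possibly modified) item; the added `self.testItem["source"] = None` line gives the pipeline a None value to clean up. A hedged sketch of what a None-removing pipeline typically looks like, not necessarily the project's actual implementation:

class RemoveNonePipeline(object):
    def process_item(self, item, spider):
        # Blank out None values so the exporters (csv, json, ...) never see them.
        for key in item:
            if item[key] is None:
                item[key] = ""
        return item


pipe = RemoveNonePipeline()
print(pipe.process_item({"value": "abc", "source": None}, spider=None))
# {'value': 'abc', 'source': ''}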

View File

@@ -47,7 +47,6 @@ class TestFoumiSpider(unittest.TestCase):
         self.assertGreater(len(requests), 0)
         self.assertIsInstance(requests[0], Request)
 
-
     def test_synonym_requests(self):
         # A test for the synonym request function
         self.spi._sources = []

View File

@@ -1,7 +1,8 @@
-from scrapy import log
-from scrapy.utils.project import get_project_settings
 import ConfigParser
+
+from scrapy.utils.project import get_project_settings
 
 
 class Configurator:
     """
     A helper class in the fourmi class. This class is used to process the settings as set
@@ -11,7 +12,6 @@ class Configurator:
     def __init__(self):
         self.scrapy_settings = get_project_settings()
 
-
     def set_output(self, filename, fileformat):
         """
         This function manipulates the Scrapy output file settings that normally would be set in the settings file.
@@ -30,23 +30,34 @@ class Configurator:
         if fileformat is not None:
             self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
 
-    def start_log(self, logfile, verbose):
+    def set_logging(self, logfile=None, verbose=0):
         """
-        This function starts the logging functionality of Scrapy using the settings given by the CLI.
+        This function changes the default settings of Scapy's logging functionality
+        using the settings given by the CLI.
         :param logfile: The location where the logfile will be saved.
-        :param verbose: A boolean value to switch between loglevels.
+        :param verbose: A integer value to switch between loglevels.
         """
+        if verbose != 0:
+            self.scrapy_settings.overrides["LOG_ENABLED"] = True
+        else:
+            self.scrapy_settings.overrides["LOG_ENABLED"] = False
+
+        if verbose == 1:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
+        elif verbose == 2:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
+        else:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"
+
+        if verbose > 1:
+            self.scrapy_settings.overrides["LOG_STDOUT"] = False
+        else:
+            self.scrapy_settings.overrides["LOG_STDOUT"] = True
+
         if logfile is not None:
-            if verbose:
-                log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
-            else:
-                log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
+            self.scrapy_settings.overrides["LOG_FILE"] = logfile
         else:
-            if verbose:
-                log.start(logstdout=False, loglevel=log.DEBUG)
-            else:
-                log.start(logstdout=True, loglevel=log.WARNING)
+            self.scrapy_settings.overrides["LOG_FILE"] = None
 
     @staticmethod
     def read_sourceconfiguration():
@@ -75,7 +86,6 @@ class Configurator:
         elif config.defaults():
             section = config.defaults()
         if 'reliability' not in section:
-            log.msg('Reliability not set for %s' % sourcename,
-                    level=log.WARNING)
+            print 'WARNING: Reliability not set for %s' % sourcename
             section['reliability'] = ''
         return section
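
The new `set_logging` above only stores overrides on the Scrapy settings; `fourmi` then calls `log.start` once with those values. As a compact reference, a standalone sketch of the same verbosity mapping in plain Python (no Scrapy dependency; the function name is illustrative):

def logging_settings(verbose=0, logfile=None):
    """Map a -v count onto the log settings used by the crawler."""
    return {
        "LOG_ENABLED": verbose != 0,
        # 1 -> WARNING, 2 -> INFO, anything else -> DEBUG
        "LOG_LEVEL": {1: "WARNING", 2: "INFO"}.get(verbose, "DEBUG"),
        # From -vv upwards the log is kept off stdout.
        "LOG_STDOUT": verbose <= 1,
        "LOG_FILE": logfile,
    }


print(logging_settings(0))                # logging disabled, no file
print(logging_settings(2, "fourmi.log"))  # INFO level, written to fourmi.log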

View File

@@ -5,6 +5,7 @@ import re
 from FourmiCrawler.sources.source import Source
 from utils.configurator import Configurator
 
+
 class SourceLoader:
     sources = []