Merge branch 'feature/extended-logging' into develop
commit cd058cab1f
@@ -26,9 +26,8 @@ class ChemSpider(Source):
     structure = 'Chemical-Structure.%s.html'
     extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='

-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
-        self.cfg = config
         self.ignore_list = []
         if 'token' not in self.cfg or self.cfg['token'] == '':
             log.msg('ChemSpider token not set or empty, search/MassSpec API '
@@ -37,7 +36,6 @@ class ChemSpider(Source):
         self.search += self.cfg['token']
         self.extendedinfo += self.cfg['token']

-
     def parse(self, response):
         sel = Selector(response)
         requests = []
@@ -199,13 +197,14 @@ class ChemSpider(Source):
         return properties

     def newresult(self, attribute, value, conditions='', source='ChemSpider'):
-        return Result({
+        return Result(
+            {
             'attribute': attribute,
             'value': value,
             'source': source,
             'reliability': self.cfg['reliability'],
             'conditions': conditions
-        })
+            })

     def parse_searchrequest(self, response):
         """Parse the initial response of the ChemSpider Search API """
@@ -22,12 +22,9 @@ class NIST(Source):

     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

-    cfg = {}
-
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
         self.ignore_list = set()
-        self.cfg = config

     def parse(self, response):
         sel = Selector(response)
@@ -88,7 +85,6 @@ class NIST(Source):
         InChiKey, CAS number
         """
         ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
-        li = ul.xpath('li')

         raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
         for synonym in raw_synonyms[0].strip().split(';\n'):
@@ -255,12 +251,13 @@ class NIST(Source):
         return results

     def newresult(self, attribute, value, conditions=''):
-        return Result({
-            'attribute': attribute,
-            'value': value,
-            'source': 'NIST',
-            'reliability': self.cfg['reliability'],
-            'conditions': conditions
+        return Result(
+            {
+                'attribute': attribute,
+                'value': value,
+                'source': 'NIST',
+                'reliability': self.cfg['reliability'],
+                'conditions': conditions
             })

     def new_compound_request(self, compound):
@@ -1,9 +1,11 @@
+import re
+
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re


 class WikipediaParser(Source):
@@ -17,11 +19,8 @@ class WikipediaParser(Source):
     __spider = None
     searched_compounds = []

-    cfg = {}
-
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
-        self.cfg = config

     def parse(self, response):
         """
@@ -53,7 +52,7 @@ class WikipediaParser(Source):
         # scrape the chembox (wikipedia template)
         items = self.parse_chembox(sel, items)

-        #scrape the drugbox (wikipedia template)
+        # scrape the drugbox (wikipedia template)
         items = self.parse_drugbox(sel, items)

         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
@@ -123,7 +122,6 @@ class WikipediaParser(Source):
                     level=log.DEBUG)
         return items

-
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)

@@ -161,10 +159,11 @@ class WikipediaParser(Source):
         return links

     def newresult(self, attribute, value):
-        return Result({
-            'attribute': attribute,
-            'value': value,
-            'source': 'Wikipedia',
-            'reliability': self.cfg['reliability'],
-            'conditions': ''
+        return Result(
+            {
+                'attribute': attribute,
+                'value': value,
+                'source': 'Wikipedia',
+                'reliability': self.cfg['reliability'],
+                'conditions': ''
             })
@@ -6,10 +6,13 @@ class Source:
     website = "http://something/*" # Regex of URI's the source is able to parse
     _spider = None

-    def __init__(self, config={}):
+    def __init__(self, config=None):
         """
         Initiation of a new Source
         """
+        self.cfg = {}
+        if config is not None:
+            self.cfg = config
         pass

     def parse(self, response):
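
Note (illustrative sketch, not part of the commit): the config={} -> config=None change above is repeated in every source class because a mutable default argument is created once, when the function is defined, and then shared by every call. The snippet below uses invented class names to show the difference; the same reasoning applies to the spider's selected_attributes=[".*"] default changed in the next hunk.

    # Hypothetical illustration of the shared-mutable-default pitfall
    # (Python 2, matching the codebase); Leaky and Safe are invented names.
    class Leaky(object):
        def __init__(self, config={}):  # one dict object, reused by every instance
            self.cfg = config


    class Safe(object):
        def __init__(self, config=None):  # a fresh dict unless one is passed in
            self.cfg = {} if config is None else config


    a, b = Leaky(), Leaky()
    a.cfg['token'] = 'abc'
    print b.cfg  # {'token': 'abc'} -- b silently picks up a's configuration
    c, d = Safe(), Safe()
    c.cfg['token'] = 'abc'
    print d.cfg  # {} -- instances stay independent
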
@@ -10,7 +10,7 @@ class FourmiSpider(Spider):
     """
     name = "FourmiSpider"

-    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
+    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
         """
         Initiation of the Spider
         :param compound: compound that will be searched.
@@ -20,7 +20,10 @@ class FourmiSpider(Spider):
         self.synonyms = set()
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.add(compound)
-        self.selected_attributes = selected_attributes
+        if selected_attributes is None:
+            self.selected_attributes = [".*"]
+        else:
+            self.selected_attributes = selected_attributes

     def parse(self, response):
         """
fourmi.py
@@ -5,6 +5,7 @@ Fourmi, a web scraper build to search specific information for a given compound
 Usage:
     fourmi search <compound>
     fourmi [options] search <compound>
+    fourmi [-v | -vv | -vvv] [options] search <compound>
     fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
     fourmi list
     fourmi [--include=<sourcename> | --exclude=<sourcename>] list
@@ -15,7 +16,7 @@ Options:
     --attributes=<regex>            Include only that match these regular expressions split by a comma. [default: .*]
     -h --help                       Show this screen.
     --version                       Show version.
-    --verbose                       Verbose logging output.
+    -v                              Verbose logging output. (Multiple occurrences increase logging level)
     --log=<file>                    Save log to an file.
     -o <file> --output=<file>       Output file [default: results.*format*]
     -f <format> --format=<format>   Output formats (supported: csv, json, jsonlines, xml) [default: csv]
@@ -25,8 +26,7 @@ Options:

 from twisted.internet import reactor
 from scrapy.crawler import Crawler
-from scrapy import log, signals
-from scrapy.utils.project import get_project_settings
+from scrapy import signals, log
 import docopt

 from FourmiCrawler.spider import FourmiSpider
@@ -58,9 +58,12 @@ def search(docopt_arguments, source_loader):
     :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
     """
     conf = Configurator()
-    conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
+    conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
     conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
-    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
+    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
+                  source_loader, docopt_arguments["--attributes"].split(','))
+    log.start(conf.scrapy_settings.get("LOG_FILE"),
+              conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
     reactor.run()

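
Note (illustrative sketch, not part of the commit): with the new "[-v | -vv | -vvv]" pattern, docopt returns how many times -v was given, and search() passes that count straight to Configurator.set_logging(). A minimal, hypothetical invocation (the usage string is trimmed down from the one in fourmi.py, and the argv list is invented):

    from docopt import docopt

    usage = """
    Usage:
        fourmi [-v | -vv | -vvv] search <compound>
    """

    args = docopt(usage, argv=['-vv', 'search', 'methane'])
    print args['-v']  # 2 -- docopt counts the repeated short flag
    # fourmi passes this count on as the verbose argument of set_logging()
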
@@ -1,7 +1,8 @@
 import unittest
+import ConfigParser

 from utils.configurator import Configurator

-import ConfigParser
+

 class TestConfigurator(unittest.TestCase):
@@ -21,11 +22,28 @@ class TestConfigurator(unittest.TestCase):
         self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")

-    # def test_start_log(self):
-    # self.conf.start_log("test.log", True)
-    # self.conf.start_log("test.log", False)
-    # self.conf.start_log(None, True)
-    # self.conf.start_log(None, False)
+    def test_start_log(self):
+        for i in range(0, 3):
+            self.conf.set_logging("TEST", i)
+            self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST")
+            if i > 0:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True)
+                if i > 1:
+                    self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False)
+                else:
+                    self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
+            else:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False)
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
+            if i == 1:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING")
+            elif i == 2:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO")
+            elif i == 3:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG")
+
+            self.conf.set_logging(verbose=i)
+            self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None)

     def test_read_sourceconfiguration(self):
         config = self.conf.read_sourceconfiguration()
@@ -13,6 +13,7 @@ class TestPipelines(unittest.TestCase):
     def test_none_pipeline(self):
         # Testing the pipeline that replaces the None values in items.
         self.testItem["value"] = "abc"
+        self.testItem["source"] = None
         pipe = pipelines.RemoveNonePipeline()
         processed = pipe.process_item(self.testItem, spider.FourmiSpider())

@@ -47,7 +47,6 @@ class TestFoumiSpider(unittest.TestCase):
         self.assertGreater(len(requests), 0)
         self.assertIsInstance(requests[0], Request)

-
     def test_synonym_requests(self):
         # A test for the synonym request function
         self.spi._sources = []
@@ -1,7 +1,8 @@
 from scrapy import log
-from scrapy.utils.project import get_project_settings
 import ConfigParser
+
+from scrapy.utils.project import get_project_settings

 class Configurator:
     """
     A helper class in the fourmi class. This class is used to process the settings as set
@@ -11,7 +12,6 @@ class Configurator:
     def __init__(self):
         self.scrapy_settings = get_project_settings()

-
     def set_output(self, filename, fileformat):
         """
         This function manipulates the Scrapy output file settings that normally would be set in the settings file.
@@ -30,23 +30,34 @@ class Configurator:
         if fileformat is not None:
             self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat

-
-    def start_log(self, logfile, verbose):
+    def set_logging(self, logfile=None, verbose=0):
         """
-        This function starts the logging functionality of Scrapy using the settings given by the CLI.
+        This function changes the default settings of Scapy's logging functionality
+        using the settings given by the CLI.
         :param logfile: The location where the logfile will be saved.
-        :param verbose: A boolean value to switch between loglevels.
+        :param verbose: A integer value to switch between loglevels.
         """
-        if logfile is not None:
-            if verbose:
-                log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
-            else:
-                log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
+        if verbose != 0:
+            self.scrapy_settings.overrides["LOG_ENABLED"] = True
         else:
-            if verbose:
-                log.start(logstdout=False, loglevel=log.DEBUG)
-            else:
-                log.start(logstdout=True, loglevel=log.WARNING)
+            self.scrapy_settings.overrides["LOG_ENABLED"] = False
+
+        if verbose == 1:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
+        elif verbose == 2:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
+        else:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"
+
+        if verbose > 1:
+            self.scrapy_settings.overrides["LOG_STDOUT"] = False
+        else:
+            self.scrapy_settings.overrides["LOG_STDOUT"] = True
+
+        if logfile is not None:
+            self.scrapy_settings.overrides["LOG_FILE"] = logfile
+        else:
+            self.scrapy_settings.overrides["LOG_FILE"] = None

     @staticmethod
     def read_sourceconfiguration():
@@ -56,7 +67,7 @@ class Configurator:
         :return a ConfigParser object of sources.cfg
         """
         config = ConfigParser.ConfigParser()
-        config.read('sources.cfg') # [TODO]: should be softcoded eventually
+        config.read('sources.cfg')  # [TODO]: should be softcoded eventually
         return config

     @staticmethod
@@ -75,7 +86,6 @@ class Configurator:
         elif config.defaults():
             section = config.defaults()
         if 'reliability' not in section:
-            log.msg('Reliability not set for %s' % sourcename,
-                    level=log.WARNING)
+            print 'WARNING: Reliability not set for %s' % sourcename
             section['reliability'] = ''
         return section
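
Note (illustrative sketch, not part of the commit): the net effect of the new set_logging() is easiest to read off the overrides it leaves behind. The values in the comments follow directly from the branches in the set_logging() hunk above; conf is just an assumed local name, and "fourmi.log" an example path.

    from utils.configurator import Configurator

    conf = Configurator()
    conf.set_logging("fourmi.log", 1)  # -v:   LOG_ENABLED=True,  LOG_LEVEL="WARNING", LOG_STDOUT=True,  LOG_FILE="fourmi.log"
    conf.set_logging("fourmi.log", 2)  # -vv:  LOG_ENABLED=True,  LOG_LEVEL="INFO",    LOG_STDOUT=False, LOG_FILE="fourmi.log"
    conf.set_logging("fourmi.log", 3)  # -vvv: LOG_ENABLED=True,  LOG_LEVEL="DEBUG",   LOG_STDOUT=False, LOG_FILE="fourmi.log"
    conf.set_logging(verbose=0)        # none: LOG_ENABLED=False, LOG_LEVEL="DEBUG",   LOG_STDOUT=True,  LOG_FILE=None
    # The actual log.start() call now happens later, in fourmi.py's search(),
    # using these LOG_* settings.
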
@@ -5,6 +5,7 @@ import re
 from FourmiCrawler.sources.source import Source
+from utils.configurator import Configurator


 class SourceLoader:
     sources = []
