Merge branch 'feature/extended-logging' into develop

commit cd058cab1f
@@ -26,9 +26,8 @@ class ChemSpider(Source):
     structure = 'Chemical-Structure.%s.html'
     extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
 
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
-        self.cfg = config
         self.ignore_list = []
         if 'token' not in self.cfg or self.cfg['token'] == '':
             log.msg('ChemSpider token not set or empty, search/MassSpec API '
@@ -37,7 +36,6 @@ class ChemSpider(Source):
             self.search += self.cfg['token']
             self.extendedinfo += self.cfg['token']
 
-
     def parse(self, response):
         sel = Selector(response)
         requests = []
@@ -199,7 +197,8 @@ class ChemSpider(Source):
         return properties
 
     def newresult(self, attribute, value, conditions='', source='ChemSpider'):
-        return Result({
+        return Result(
+            {
             'attribute': attribute,
             'value': value,
             'source': source,
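Note: the extendedinfo template above is completed in two steps: the API token is appended once in __init__, and the compound id is substituted per request. A standalone sketch (the token is made up, the csid arbitrary):

    # Sketch only; not project code.
    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
    extendedinfo += '0123-fake-token'   # appended once at construction
    print(extendedinfo % 2157)          # csid filled in per request
    # MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=2157&token=0123-fake-token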
@@ -22,12 +22,9 @@ class NIST(Source):
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
-    cfg = {}
-
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
         self.ignore_list = set()
-        self.cfg = config
 
     def parse(self, response):
         sel = Selector(response)
@@ -88,7 +85,6 @@ class NIST(Source):
        InChiKey, CAS number
        """
        ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
-       li = ul.xpath('li')
 
        raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
        for synonym in raw_synonyms[0].strip().split(';\n'):
@@ -255,7 +251,8 @@ class NIST(Source):
         return results
 
     def newresult(self, attribute, value, conditions=''):
-        return Result({
+        return Result(
+            {
             'attribute': attribute,
             'value': value,
             'source': 'NIST',
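Note: the NIST parser leans on XPath predicates that select a ul by the text of a nested strong element. A self-contained sketch of the same queries against a made-up fragment (Scrapy's Selector accepts raw text):

    from scrapy.selector import Selector

    # Hypothetical HTML mimicking the structure the parser expects
    html = '''<body><ul>
      <li><strong>IUPAC Standard InChI:</strong> InChI=1S/H2O/h1H2</li>
      <li><strong>Other names:</strong> water;
    dihydrogen monoxide</li>
    </ul></body>'''

    sel = Selector(text=html)
    ul = sel.xpath('//ul[li/strong="IUPAC Standard InChI:"]')
    raw = ul.xpath('li[strong="Other names:"]/text()').extract()
    print([s.strip() for s in raw[0].strip().split(';\n')])
    # ['water', 'dihydrogen monoxide']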
@@ -1,9 +1,11 @@
+import re
+
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re
 
 
 class WikipediaParser(Source):
@@ -17,11 +19,8 @@ class WikipediaParser(Source):
     __spider = None
     searched_compounds = []
 
-    cfg = {}
-
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
-        self.cfg = config
 
     def parse(self, response):
         """
@@ -123,7 +122,6 @@ class WikipediaParser(Source):
                   level=log.DEBUG)
         return items
 
-
    def new_compound_request(self, compound):
        return Request(url=self.website[:-1] + compound, callback=self.parse)
 
@@ -161,7 +159,8 @@ class WikipediaParser(Source):
         return links
 
     def newresult(self, attribute, value):
-        return Result({
+        return Result(
+            {
             'attribute': attribute,
             'value': value,
             'source': 'Wikipedia',
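All three newresult helpers construct the same Result item. For orientation, the fields exercised across this diff (attribute, value, source, conditions, plus reliability from the configurator) suggest a Scrapy Item along these lines; this is a sketch, not the actual FourmiCrawler/items.py:

    from scrapy.item import Item, Field

    class Result(Item):
        # Field names inferred from the newresult() calls in this diff;
        # the real items module may define more.
        attribute = Field()
        value = Field()
        source = Field()
        reliability = Field()
        conditions = Field()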
@@ -6,10 +6,13 @@ class Source:
     website = "http://something/*"  # Regex of URI's the source is able to parse
     _spider = None
 
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         """
         Initiation of a new Source
         """
+        self.cfg = {}
+        if config is not None:
+            self.cfg = config
         pass
 
     def parse(self, response):
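The config=None guard added here is the recurring cleanup in this merge: a default like config={} is evaluated once, when the function is defined, so every call that omits the argument shares one dict. A minimal stand-in (not project code) showing the new contract:

    # Mirrors the new Source.__init__ behaviour (sketch)
    class Source(object):
        def __init__(self, config=None):
            self.cfg = {}
            if config is not None:
                self.cfg = config

    a, b = Source(), Source()
    a.cfg['token'] = 'abc'
    print(b.cfg)  # {} -- with config={} as the default, b.cfg would
                  # have shown the token written through a.cfg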
@@ -10,7 +10,7 @@ class FourmiSpider(Spider):
     """
     name = "FourmiSpider"
 
-    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
+    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
         """
         Initiation of the Spider
         :param compound: compound that will be searched.
@@ -20,6 +20,9 @@ class FourmiSpider(Spider):
         self.synonyms = set()
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.add(compound)
-        self.selected_attributes = selected_attributes
+        if selected_attributes is None:
+            self.selected_attributes = [".*"]
+        else:
+            self.selected_attributes = selected_attributes
 
     def parse(self, response):
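The same None-default idiom protects the list argument here. An equivalent, more compact guard (a style alternative, not what the commit uses):

    # Conditional-expression form of the same default handling (sketch)
    def make_attributes(selected_attributes=None):
        return [".*"] if selected_attributes is None else selected_attributes

    print(make_attributes())             # ['.*']
    print(make_attributes(['density']))  # ['density']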
fourmi.py (13 changed lines)

@@ -5,6 +5,7 @@ Fourmi, a web scraper build to search specific information for a given compound
 Usage:
     fourmi search <compound>
     fourmi [options] search <compound>
+    fourmi [-v | -vv | -vvv] [options] search <compound>
     fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
     fourmi list
     fourmi [--include=<sourcename> | --exclude=<sourcename>] list
@@ -15,7 +16,7 @@ Options:
     --attributes=<regex>            Include only that match these regular expressions split by a comma. [default: .*]
     -h --help                       Show this screen.
     --version                       Show version.
-    --verbose                       Verbose logging output.
+    -v                              Verbose logging output. (Multiple occurrences increase logging level)
     --log=<file>                    Save log to an file.
     -o <file> --output=<file>       Output file [default: results.*format*]
     -f <format> --format=<format>   Output formats (supported: csv, json, jsonlines, xml) [default: csv]
|
|||||||
|
|
||||||
from twisted.internet import reactor
|
from twisted.internet import reactor
|
||||||
from scrapy.crawler import Crawler
|
from scrapy.crawler import Crawler
|
||||||
from scrapy import log, signals
|
from scrapy import signals, log
|
||||||
from scrapy.utils.project import get_project_settings
|
|
||||||
import docopt
|
import docopt
|
||||||
|
|
||||||
from FourmiCrawler.spider import FourmiSpider
|
from FourmiCrawler.spider import FourmiSpider
|
||||||
@ -58,9 +58,12 @@ def search(docopt_arguments, source_loader):
|
|||||||
:param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
|
:param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
|
||||||
"""
|
"""
|
||||||
conf = Configurator()
|
conf = Configurator()
|
||||||
conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
|
conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
|
||||||
conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
|
conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
|
||||||
setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
|
setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
|
||||||
|
source_loader, docopt_arguments["--attributes"].split(','))
|
||||||
|
log.start(conf.scrapy_settings.get("LOG_FILE"),
|
||||||
|
conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
|
||||||
reactor.run()
|
reactor.run()
|
||||||
|
|
||||||
|
|
||||||
|
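With this change logging is configured in two stages: Configurator.set_logging() only records overrides on the settings object, and fourmi.py starts Scrapy's log observer exactly once, after all settings are composed. The same call in keyword form (keywords as used elsewhere in this diff; values illustrative):

    from scrapy import log
    log.start(logfile=None, loglevel="WARNING", logstdout=True)  # sketch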
@@ -1,7 +1,8 @@
 import unittest
+import ConfigParser
 
 from utils.configurator import Configurator
 
-import ConfigParser
 
 
 class TestConfigurator(unittest.TestCase):
@@ -21,11 +22,28 @@ class TestConfigurator(unittest.TestCase):
         self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
 
-    # def test_start_log(self):
-    # self.conf.start_log("test.log", True)
-    # self.conf.start_log("test.log", False)
-    # self.conf.start_log(None, True)
-    # self.conf.start_log(None, False)
+    def test_start_log(self):
+        for i in range(0, 3):
+            self.conf.set_logging("TEST", i)
+            self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST")
+            if i > 0:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True)
+                if i > 1:
+                    self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False)
+                else:
+                    self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
+            else:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False)
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
+            if i == 1:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING")
+            elif i == 2:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO")
+            elif i == 3:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG")
+
+            self.conf.set_logging(verbose=i)
+            self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None)
 
     def test_read_sourceconfiguration(self):
         config = self.conf.read_sourceconfiguration()
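One detail in the new test: range(0, 3) yields 0, 1 and 2, so the elif i == 3 branch (the DEBUG assertion) is never executed as written:

    print(list(range(0, 3)))  # [0, 1, 2] -- i never reaches 3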
@@ -13,6 +13,7 @@ class TestPipelines(unittest.TestCase):
     def test_none_pipeline(self):
         # Testing the pipeline that replaces the None values in items.
         self.testItem["value"] = "abc"
+        self.testItem["source"] = None
         pipe = pipelines.RemoveNonePipeline()
         processed = pipe.process_item(self.testItem, spider.FourmiSpider())
 
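The new assertion feeds a None source through the pipeline. For orientation, RemoveNonePipeline plausibly looks something like this (inferred from the test; the real FourmiCrawler/pipelines.py may differ):

    class RemoveNonePipeline(object):
        def process_item(self, item, spider):
            # Replace None values so the feed exporters get consistent rows
            for key in item:
                if item[key] is None:
                    item[key] = ''
            return item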
@@ -47,7 +47,6 @@ class TestFoumiSpider(unittest.TestCase):
         self.assertGreater(len(requests), 0)
         self.assertIsInstance(requests[0], Request)
 
-
     def test_synonym_requests(self):
         # A test for the synonym request function
         self.spi._sources = []
@ -1,7 +1,8 @@
|
|||||||
from scrapy import log
|
|
||||||
from scrapy.utils.project import get_project_settings
|
|
||||||
import ConfigParser
|
import ConfigParser
|
||||||
|
|
||||||
|
from scrapy.utils.project import get_project_settings
|
||||||
|
|
||||||
|
|
||||||
class Configurator:
|
class Configurator:
|
||||||
"""
|
"""
|
||||||
A helper class in the fourmi class. This class is used to process the settings as set
|
A helper class in the fourmi class. This class is used to process the settings as set
|
||||||
@@ -11,7 +12,6 @@ class Configurator:
     def __init__(self):
         self.scrapy_settings = get_project_settings()
 
-
     def set_output(self, filename, fileformat):
         """
         This function manipulates the Scrapy output file settings that normally would be set in the settings file.
@@ -30,23 +30,34 @@ class Configurator:
         if fileformat is not None:
             self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
 
-    def start_log(self, logfile, verbose):
+    def set_logging(self, logfile=None, verbose=0):
         """
-        This function starts the logging functionality of Scrapy using the settings given by the CLI.
+        This function changes the default settings of Scrapy's logging functionality
+        using the settings given by the CLI.
         :param logfile: The location where the logfile will be saved.
-        :param verbose: A boolean value to switch between loglevels.
+        :param verbose: An integer value to switch between loglevels.
         """
+        if verbose != 0:
+            self.scrapy_settings.overrides["LOG_ENABLED"] = True
+        else:
+            self.scrapy_settings.overrides["LOG_ENABLED"] = False
+
+        if verbose == 1:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
+        elif verbose == 2:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
+        else:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"
+
+        if verbose > 1:
+            self.scrapy_settings.overrides["LOG_STDOUT"] = False
+        else:
+            self.scrapy_settings.overrides["LOG_STDOUT"] = True
+
         if logfile is not None:
-            if verbose:
-                log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
-            else:
-                log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
+            self.scrapy_settings.overrides["LOG_FILE"] = logfile
         else:
-            if verbose:
-                log.start(logstdout=False, loglevel=log.DEBUG)
-            else:
-                log.start(logstdout=True, loglevel=log.WARNING)
+            self.scrapy_settings.overrides["LOG_FILE"] = None
 
     @staticmethod
     def read_sourceconfiguration():
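Condensed, the new set_logging() implements this decision table (restated from the diff above as a pure function; sketch only):

    def logging_overrides(verbose=0):
        return {
            "LOG_ENABLED": verbose != 0,
            "LOG_LEVEL": {1: "WARNING", 2: "INFO"}.get(verbose, "DEBUG"),
            "LOG_STDOUT": verbose <= 1,
        }

    print(logging_overrides(2))
    # LOG_ENABLED True, LOG_LEVEL 'INFO', LOG_STDOUT False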
@@ -75,7 +86,6 @@ class Configurator:
         elif config.defaults():
             section = config.defaults()
         if 'reliability' not in section:
-            log.msg('Reliability not set for %s' % sourcename,
-                    level=log.WARNING)
+            print 'WARNING: Reliability not set for %s' % sourcename
         section['reliability'] = ''
         return section
@@ -5,6 +5,7 @@ import re
 from FourmiCrawler.sources.source import Source
 from utils.configurator import Configurator
 
+
 class SourceLoader:
     sources = []
 