diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index 87a6ee7..0110e57 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -26,9 +26,8 @@ class ChemSpider(Source):
     structure = 'Chemical-Structure.%s.html'
     extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
 
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
-        self.cfg = config
         self.ignore_list = []
         if 'token' not in self.cfg or self.cfg['token'] == '':
             log.msg('ChemSpider token not set or empty, search/MassSpec API '
@@ -37,7 +36,6 @@ class ChemSpider(Source):
             self.search += self.cfg['token']
             self.extendedinfo += self.cfg['token']
 
-
     def parse(self, response):
         sel = Selector(response)
         requests = []
@@ -199,13 +197,14 @@ class ChemSpider(Source):
         return properties
 
     def newresult(self, attribute, value, conditions='', source='ChemSpider'):
-        return Result({
+        return Result(
+            {
                 'attribute': attribute,
                 'value': value,
                 'source': source,
                 'reliability': self.cfg['reliability'],
                 'conditions': conditions
-        })
+            })
 
     def parse_searchrequest(self, response):
         """Parse the initial response of the ChemSpider Search API """
diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 3c323ef..934b457 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -22,12 +22,9 @@ class NIST(Source):
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
-    cfg = {}
-
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
         self.ignore_list = set()
-        self.cfg = config
 
     def parse(self, response):
         sel = Selector(response)
@@ -88,7 +85,6 @@ class NIST(Source):
         InChiKey, CAS number
         """
         ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
-        li = ul.xpath('li')
 
         raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
         for synonym in raw_synonyms[0].strip().split(';\n'):
@@ -255,12 +251,13 @@ class NIST(Source):
         return results
 
     def newresult(self, attribute, value, conditions=''):
-        return Result({
-            'attribute': attribute,
-            'value': value,
-            'source': 'NIST',
-            'reliability': self.cfg['reliability'],
-            'conditions': conditions
+        return Result(
+            {
+                'attribute': attribute,
+                'value': value,
+                'source': 'NIST',
+                'reliability': self.cfg['reliability'],
+                'conditions': conditions
         })
 
     def new_compound_request(self, compound):
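The config={} to config=None change above (repeated in every source below) works around Python's mutable-default pitfall: a default dict is created once, when the def statement runs, and is then shared by every call that omits the argument. A minimal standalone sketch of the problem and the fix (Broken and Fixed are hypothetical names, not part of this patch):

    class Broken(object):
        def __init__(self, config={}):      # one dict, shared by all instances
            self.cfg = config

    a, b = Broken(), Broken()
    a.cfg['token'] = 'abc'
    print(b.cfg)                            # {'token': 'abc'} -- b sees a's change

    class Fixed(object):
        def __init__(self, config=None):    # a fresh dict per instance
            self.cfg = {} if config is None else config

    c, d = Fixed(), Fixed()
    c.cfg['token'] = 'abc'
    print(d.cfg)                            # {} -- instances stay independent

Dropping the duplicated self.cfg = config assignments in the subclasses is safe because Source.__init__ (changed later in this diff) now stores the normalised configuration itself.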
diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index 4aa49b2..401698c 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -1,9 +1,11 @@
+import re
+
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re
 
 
 class WikipediaParser(Source):
@@ -17,11 +19,8 @@ class WikipediaParser(Source):
     __spider = None
     searched_compounds = []
 
-    cfg = {}
-
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
-        self.cfg = config
 
     def parse(self, response):
         """
@@ -53,7 +52,7 @@ class WikipediaParser(Source):
         # scrape the chembox (wikipedia template)
         items = self.parse_chembox(sel, items)
 
-        #scrape the drugbox (wikipedia template)
+        # scrape the drugbox (wikipedia template)
         items = self.parse_drugbox(sel, items)
 
         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
@@ -123,7 +122,6 @@ class WikipediaParser(Source):
                     level=log.DEBUG)
         return items
 
-
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)
 
@@ -161,10 +159,11 @@ class WikipediaParser(Source):
         return links
 
     def newresult(self, attribute, value):
-        return Result({
-            'attribute': attribute,
-            'value': value,
-            'source': 'Wikipedia',
-            'reliability': self.cfg['reliability'],
-            'conditions': ''
+        return Result(
+            {
+                'attribute': attribute,
+                'value': value,
+                'source': 'Wikipedia',
+                'reliability': self.cfg['reliability'],
+                'conditions': ''
         })
diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py
index a609bb9..36218b0 100644
--- a/FourmiCrawler/sources/source.py
+++ b/FourmiCrawler/sources/source.py
@@ -6,10 +6,13 @@ class Source:
     website = "http://something/*"  # Regex of URI's the source is able to parse
     _spider = None
 
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         """
         Initiation of a new Source
         """
+        self.cfg = {}
+        if config is not None:
+            self.cfg = config
         pass
 
     def parse(self, response):
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 5c09f07..ebfd2cf 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -10,7 +10,7 @@ class FourmiSpider(Spider):
     """
     name = "FourmiSpider"
 
-    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
+    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
         """
         Initiation of the Spider
         :param compound: compound that will be searched.
@@ -20,7 +20,10 @@ class FourmiSpider(Spider):
         self.synonyms = set()
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.add(compound)
-        self.selected_attributes = selected_attributes
+        if selected_attributes is None:
+            self.selected_attributes = [".*"]
+        else:
+            self.selected_attributes = selected_attributes
 
     def parse(self, response):
         """
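Since Source.__init__ now guarantees that self.cfg exists, a concrete source only needs to delegate to the base class. A hedged sketch of the pattern the parsers above now follow (ExampleSource and its use of .get() are illustrative, not part of the patch):

    from FourmiCrawler.sources.source import Source

    class ExampleSource(Source):
        website = "http://example.org/*"    # regex of URIs this source could parse

        def __init__(self, config=None):
            Source.__init__(self, config)   # base class normalises config into self.cfg
            self.reliability = self.cfg.get('reliability', '')

The same None-default idiom is applied to FourmiSpider's selected_attributes above, where a shared default list would be just as dangerous as a shared dict.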
""" conf = Configurator() - conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"]) + conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"]) conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) - setup_crawler(docopt_arguments[""], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) + setup_crawler(docopt_arguments[""], conf.scrapy_settings, + source_loader, docopt_arguments["--attributes"].split(',')) + log.start(conf.scrapy_settings.get("LOG_FILE"), + conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) reactor.run() diff --git a/tests/test_configurator.py b/tests/test_configurator.py index eb43cb7..df29da9 100644 --- a/tests/test_configurator.py +++ b/tests/test_configurator.py @@ -1,7 +1,8 @@ import unittest +import ConfigParser + from utils.configurator import Configurator -import ConfigParser class TestConfigurator(unittest.TestCase): @@ -21,11 +22,28 @@ class TestConfigurator(unittest.TestCase): self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv") self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") - # def test_start_log(self): - # self.conf.start_log("test.log", True) - # self.conf.start_log("test.log", False) - # self.conf.start_log(None, True) - # self.conf.start_log(None, False) + def test_start_log(self): + for i in range(0, 3): + self.conf.set_logging("TEST", i) + self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST") + if i > 0: + self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True) + if i > 1: + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False) + else: + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True) + else: + self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False) + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True) + if i == 1: + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING") + elif i == 2: + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO") + elif i == 3: + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG") + + self.conf.set_logging(verbose=i) + self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None) def test_read_sourceconfiguration(self): config = self.conf.read_sourceconfiguration() diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index dfb8e83..eb2b070 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -13,6 +13,7 @@ class TestPipelines(unittest.TestCase): def test_none_pipeline(self): # Testing the pipeline that replaces the None values in items. 
self.testItem["value"] = "abc" + self.testItem["source"] = None pipe = pipelines.RemoveNonePipeline() processed = pipe.process_item(self.testItem, spider.FourmiSpider()) diff --git a/tests/test_spider.py b/tests/test_spider.py index 589a571..1ee40b1 100644 --- a/tests/test_spider.py +++ b/tests/test_spider.py @@ -47,7 +47,6 @@ class TestFoumiSpider(unittest.TestCase): self.assertGreater(len(requests), 0) self.assertIsInstance(requests[0], Request) - def test_synonym_requests(self): # A test for the synonym request function self.spi._sources = [] diff --git a/utils/configurator.py b/utils/configurator.py index dfc6330..62987c6 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -1,7 +1,8 @@ -from scrapy import log -from scrapy.utils.project import get_project_settings import ConfigParser +from scrapy.utils.project import get_project_settings + + class Configurator: """ A helper class in the fourmi class. This class is used to process the settings as set @@ -11,7 +12,6 @@ class Configurator: def __init__(self): self.scrapy_settings = get_project_settings() - def set_output(self, filename, fileformat): """ This function manipulates the Scrapy output file settings that normally would be set in the settings file. @@ -30,23 +30,34 @@ class Configurator: if fileformat is not None: self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat - - def start_log(self, logfile, verbose): + def set_logging(self, logfile=None, verbose=0): """ - This function starts the logging functionality of Scrapy using the settings given by the CLI. + This function changes the default settings of Scapy's logging functionality + using the settings given by the CLI. :param logfile: The location where the logfile will be saved. - :param verbose: A boolean value to switch between loglevels. + :param verbose: A integer value to switch between loglevels. 
""" - if logfile is not None: - if verbose: - log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG) - else: - log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING) + if verbose != 0: + self.scrapy_settings.overrides["LOG_ENABLED"] = True else: - if verbose: - log.start(logstdout=False, loglevel=log.DEBUG) - else: - log.start(logstdout=True, loglevel=log.WARNING) + self.scrapy_settings.overrides["LOG_ENABLED"] = False + + if verbose == 1: + self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING" + elif verbose == 2: + self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO" + else: + self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG" + + if verbose > 1: + self.scrapy_settings.overrides["LOG_STDOUT"] = False + else: + self.scrapy_settings.overrides["LOG_STDOUT"] = True + + if logfile is not None: + self.scrapy_settings.overrides["LOG_FILE"] = logfile + else: + self.scrapy_settings.overrides["LOG_FILE"] = None @staticmethod def read_sourceconfiguration(): @@ -56,7 +67,7 @@ class Configurator: :return a ConfigParser object of sources.cfg """ config = ConfigParser.ConfigParser() - config.read('sources.cfg') # [TODO]: should be softcoded eventually + config.read('sources.cfg') # [TODO]: should be softcoded eventually return config @staticmethod @@ -75,7 +86,6 @@ class Configurator: elif config.defaults(): section = config.defaults() if 'reliability' not in section: - log.msg('Reliability not set for %s' % sourcename, - level=log.WARNING) + print 'WARNING: Reliability not set for %s' % sourcename section['reliability'] = '' return section diff --git a/utils/sourceloader.py b/utils/sourceloader.py index 9b33657..8c54464 100644 --- a/utils/sourceloader.py +++ b/utils/sourceloader.py @@ -5,6 +5,7 @@ import re from FourmiCrawler.sources.source import Source from utils.configurator import Configurator + class SourceLoader: sources = []