diff --git a/.gitignore b/.gitignore
index 158ef41..14c4e72 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,9 @@
 #Python Specific ignores
 *.pyc
 
+#may contain authentication information
+sources.cfg
+
 #THINGS WE WOULD NEVER EVER WANT!
 #ignore thumbnails created by windows
 Thumbs.db
diff --git a/.travis.yml b/.travis.yml
index 34d3a88..24c5dc5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,7 +10,7 @@ install:
 
 # command to run tests, e.g. python setup.py test
 script:
-  - nosetests --with-coverage --cover-package=FourmiCrawler tests
+  - nosetests --with-coverage --cover-package=FourmiCrawler,utils tests
 
 notifications:
   slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index 8c0bd8b..87a6ee7 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -9,7 +9,7 @@ from FourmiCrawler.items import Result
 
 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
-
+# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not
 
 class ChemSpider(Source):
     """ChemSpider scraper for synonyms and properties
@@ -20,19 +20,23 @@ class ChemSpider(Source):
     somewhere.
     """
 
-    def __init__(self):
-        Source.__init__(self)
-
     website = 'http://www.chemspider.com/*'
 
-    # [TODO] - Save and access token of specific user.
-    search = ('Search.asmx/SimpleSearch?query=%s&token='
-              '052bfd06-5ce4-43d6-bf12-89eabefd2338')
+    search = 'Search.asmx/SimpleSearch?query=%s&token='
     structure = 'Chemical-Structure.%s.html'
-    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
-                    '052bfd06-5ce4-43d6-bf12-89eabefd2338')
+    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
+
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.cfg = config
+        self.ignore_list = []
+        if 'token' not in self.cfg or self.cfg['token'] == '':
+            log.msg('ChemSpider token not set or empty, search/MassSpec API '
+                    'not available', level=log.WARNING)
+            self.cfg['token'] = ''
+        self.search += self.cfg['token']
+        self.extendedinfo += self.cfg['token']
 
-    ignore_list = []
 
     def parse(self, response):
         sel = Selector(response)
@@ -44,8 +48,7 @@
 
         return requests
 
-    @staticmethod
-    def parse_properties(sel):
+    def parse_properties(self, sel):
         """scrape Experimental Data and Predicted ACD/Labs tabs"""
         properties = []
 
@@ -76,13 +79,12 @@
                 prop_value = m.group(1)
                 prop_conditions = m.group(2)
 
-            new_prop = Result({
-                'attribute': prop_name,
-                'value': prop_value,
-                'source': 'ChemSpider Predicted - ACD/Labs Tab',
-                'reliability': 'Unknown',
-                'conditions': prop_conditions
-            })
+            new_prop = self.newresult(
+                attribute=prop_name,
+                value=prop_value,
+                source='ChemSpider Predicted - ACD/Labs Tab',
+                conditions=prop_conditions
+            )
             properties.append(new_prop)
             log.msg('CS prop: |%s| |%s| |%s|' %
                     (new_prop['attribute'], new_prop['value'], new_prop['source']),
@@ -100,14 +102,11 @@
             if line.xpath('span/text()'):
                 property_name = line.xpath('span/text()').extract()[0].rstrip()
             else:
-                new_prop = Result({
-                    'attribute': property_name[:-1],
-                    'value': line.xpath('text()').extract()[0].rstrip(),
-                    'source': line.xpath(
-                        'strong/text()').extract()[0].rstrip(),
-                    'reliability': 'Unknown',
-                    'conditions': ''
-                })
+                new_prop = self.newresult(
+                    attribute=property_name[:-1],
+                    value=line.xpath('text()').extract()[0].rstrip(),
+                    source=line.xpath('strong/text()').extract()[0].rstrip(),
+                )
                 properties.append(new_prop)
                 log.msg('CS prop: |%s| |%s| |%s|' %
                         (new_prop['attribute'], new_prop['value'],
@@ -183,25 +182,31 @@ class ChemSpider(Source):
         }
         return synonym
 
-    @staticmethod
-    def parse_extendedinfo(response):
+    def parse_extendedinfo(self, response):
         """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
         sel = Selector(response)
         properties = []
         names = sel.xpath('*').xpath('name()').extract()
         values = sel.xpath('*').xpath('text()').extract()
         for (name, value) in zip(names, values):
-            result = Result({
-                'attribute': name,
-                'value': value,  # These values have no unit!
-                'source': 'ChemSpider ExtendedCompoundInfo',
-                'reliability': 'Unknown',
-                'conditions': ''
-            })
+            result = self.newresult(
+                attribute=name,
+                value=value,  # These values have no unit!
+                source='ChemSpider ExtendedCompoundInfo',
+            )
             if result['value']:
                 properties.append(result)
         return properties
 
+    def newresult(self, attribute, value, conditions='', source='ChemSpider'):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': source,
+            'reliability': self.cfg['reliability'],
+            'conditions': conditions
+        })
+
     def parse_searchrequest(self, response):
         """Parse the initial response of the ChemSpider Search API """
         sel = Selector(response)
@@ -224,7 +229,7 @@
                         callback=self.parse_extendedinfo)]
 
     def new_compound_request(self, compound):
-        if compound in self.ignore_list:  # [TODO] - add regular expression
+        if compound in self.ignore_list or self.cfg['token'] == '':
             return None
         searchurl = self.website[:-1] + self.search % compound
         log.msg('chemspider compound', level=log.DEBUG)
diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 6e8fabb..3c323ef 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -22,10 +22,12 @@ class NIST(Source):
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
-    ignore_list = set()
+    cfg = {}
 
-    def __init__(self):
-        Source.__init__(self)
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.ignore_list = set()
+        self.cfg = config
 
     def parse(self, response):
         sel = Selector(response)
@@ -114,13 +116,10 @@
 
         requests = []
         for key, value in data.iteritems():
-            result = Result({
-                'attribute': key,
-                'value': value,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': ''
-            })
+            result = self.newresult(
+                attribute=key,
+                value=value
+            )
             requests.append(result)
 
         return requests
@@ -150,19 +149,16 @@
             name = m.group(1)
             condition = m.group(2)
 
-            result = Result({
-                'attribute': name,
-                'value': data[1] + ' ' + data[2],
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': condition
-            })
+            result = self.newresult(
+                attribute=name,
+                value=data[1] + ' ' + data[2],
+                conditions=condition
+            )
             log.msg('NIST: |%s|' % data, level=log.DEBUG)
             results.append(result)
 
         return results
 
-    @staticmethod
-    def parse_transition_data(table, summary):
+    def parse_transition_data(self, table, summary):
         """Parses the table containing properties regarding phase changes"""
         results = []
 
@@ -174,19 +170,16 @@
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': tds[0] + ' ' + unit,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
-            })
+            result = self.newresult(
+                attribute=summary,
+                value=tds[0] + ' ' + unit,
+                conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
+            )
             results.append(result)
 
         return results
 
-    @staticmethod
-    def parse_generic_data(table, summary):
+    def parse_generic_data(self, table, summary):
         """Parses the common tables of 4 and 5 rows. Assumes they are of the
         form:
         Symbol (unit)|Temperature (K)|Method|Reference|Comment
@@ -202,36 +195,30 @@
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': tds[0] + ' ' + unit,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K' % tds[1]
-            })
+            result = self.newresult(
+                attribute=summary,
+                value=tds[0] + ' ' + unit,
+                conditions='%s K' % tds[1]
+            )
             results.append(result)
 
         return results
 
-    @staticmethod
-    def parse_antoine_data(table, summary):
+    def parse_antoine_data(self, table, summary):
         """Parse table containing parameters for the Antione equation"""
         results = []
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K' % tds[0]
-            })
+            result = self.newresult(
+                attribute=summary,
+                value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
+                conditions='%s K' % tds[0]
+            )
             results.append(result)
 
         return results
 
-    @staticmethod
-    def parse_individual_datapoints(response):
+    def parse_individual_datapoints(self, response):
         """Parses the page linked from aggregate data"""
         sel = Selector(response)
         table = sel.xpath('//table[@class="data"]')[0]
@@ -258,17 +245,24 @@
             if m:
                 uncertainty = '+- %s ' % m.group(1)
             # [TODO]: get the plusminus sign working in here
-            result = Result({
-                'attribute': name,
-                'value': '%s %s%s' % (tds[0], uncertainty, unit),
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': condition
-            })
+            result = self.newresult(
+                attribute=name,
+                value='%s %s%s' % (tds[0], uncertainty, unit),
+                conditions=condition
+            )
             results.append(result)
 
         return results
 
+    def newresult(self, attribute, value, conditions=''):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': 'NIST',
+            'reliability': self.cfg['reliability'],
+            'conditions': conditions
+        })
+
     def new_compound_request(self, compound):
         if compound not in self.ignore_list:
             self.ignore_list.update(compound)
diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index 868b49f..8722cef 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -19,8 +19,11 @@ class WikipediaParser(Source):
     __spider = None
     searched_compounds = []
 
-    def __init__(self):
-        Source.__init__(self)
+    cfg = {}
+
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.cfg = config
 
     def parse(self, response):
         """ Distributes the above described behaviour """
@@ -44,13 +47,10 @@
         prop_names = tr_list[::2]
         prop_values = tr_list[1::2]
         for i, prop_name in enumerate(prop_names):
-            item = Result({
-                'attribute': prop_name.extract().encode('utf-8'),
-                'value': prop_values[i].extract().encode('utf-8'),
-                'source': "Wikipedia",
-                'reliability': "Unknown",
-                'conditions': ""
-            })
+            item = self.newresult(
+                attribute=prop_name.extract().encode('utf-8'),
+                value=prop_values[i].extract().encode('utf-8')
+            )
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']),
                     level=log.DEBUG)
@@ -61,13 +61,10 @@ class WikipediaParser(Source):
             log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
             if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
                     'normalize-space(string())'):
-                item = Result({
-                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'source': "Wikipedia",
-                    'reliability': "Unknown",
-                    'conditions': ""
-                })
+                item = self.newresult(
+                    attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                )
                 items.append(item)
                 log.msg(
                     'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
@@ -116,4 +113,13 @@ class WikipediaParser(Source):
         """ find external links, named 'Identifiers' to different sources. """
         links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                           '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
-        return links
\ No newline at end of file
+        return links
+
+    def newresult(self, attribute, value):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': 'Wikipedia',
+            'reliability': self.cfg['reliability'],
+            'conditions': ''
+        })
diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py
index d289d72..a609bb9 100644
--- a/FourmiCrawler/sources/source.py
+++ b/FourmiCrawler/sources/source.py
@@ -6,7 +6,7 @@ class Source:
     website = "http://something/*"  # Regex of URI's the source is able to parse
     _spider = None
 
-    def __init__(self):
+    def __init__(self, config={}):
         """
         Initiation of a new Source
         """
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 60f7363..5c09f07 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -9,8 +9,6 @@ class FourmiSpider(Spider):
    A spider writen for the Fourmi Project which calls upon all available
    sources to request and scrape data.
    """
    name = "FourmiSpider"
-    _sources = []
-    synonyms = set()
 
    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
        """
@@ -18,6 +16,8 @@ class FourmiSpider(Spider):
        :param compound: compound that will be searched.
        :param selected_attributes: A list of regular expressions that the attributes should match.
""" + self._sources = [] + self.synonyms = set() super(FourmiSpider, self).__init__(*args, **kwargs) self.synonyms.add(compound) self.selected_attributes = selected_attributes diff --git a/README.md b/README.md index 2b286a0..48b0419 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Fourmi -**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi) +**Master branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=master)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=master) -**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi) +**Developing branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=develop)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=develop) Fourmi is an web scraper for chemical substances. The program is designed to be used as a search engine to search multiple chemical databases for a specific diff --git a/fourmi.py b/fourmi.py index 3596cf3..e6d7e9a 100755 --- a/fourmi.py +++ b/fourmi.py @@ -17,8 +17,8 @@ Options: --version Show version. --verbose Verbose logging output. --log= Save log to an file. - -o --output= Output file [default: result.*format*] - -f --format= Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines] + -o --output= Output file [default: results.*format*] + -f --format= Output formats (supported: csv, json, jsonlines, xml) [default: csv] --include= Include only sources that match these regular expressions split by a comma. --exclude= Exclude the sources that match these regular expressions split by a comma. """ @@ -30,7 +30,8 @@ from scrapy.utils.project import get_project_settings import docopt from FourmiCrawler.spider import FourmiSpider -from sourceloader import SourceLoader +from utils.configurator import Configurator +from utils.sourceloader import SourceLoader def setup_crawler(compound, settings, source_loader, attributes): @@ -50,59 +51,22 @@ def setup_crawler(compound, settings, source_loader, attributes): crawler.start() -def scrapy_settings_manipulation(docopt_arguments): - """ - This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi - project these are command line arguments. - :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. - """ - settings = get_project_settings() - - if docopt_arguments["--output"] != 'result.*format*': - settings.overrides["FEED_URI"] = docopt_arguments["--output"] - elif docopt_arguments["--format"] == "jsonlines": - settings.overrides["FEED_URI"] = "results.json" - elif docopt_arguments["--format"] is not None: - settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"] - - if docopt_arguments["--format"] is not None: - settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"] - - return settings - - -def start_log(docopt_arguments): - """ - This function starts the logging functionality of Scrapy using the settings given by the CLI. - :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. 
- """ - if docopt_arguments["--log"] is not None: - if docopt_arguments["--verbose"]: - log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG) - else: - log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING) - else: - if docopt_arguments["--verbose"]: - log.start(logstdout=False, loglevel=log.DEBUG) - else: - log.start(logstdout=True, loglevel=log.WARNING) - - def search(docopt_arguments, source_loader): """ The function that facilitates the search for a specific compound. :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. :param source_loader: An initiated SourceLoader object pointed at the directory with the sources. """ - start_log(docopt_arguments) - settings = scrapy_settings_manipulation(docopt_arguments) - setup_crawler(docopt_arguments[""], settings, source_loader, docopt_arguments["--attributes"].split(',')) + conf = Configurator() + conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"]) + conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) + setup_crawler(docopt_arguments[""], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) reactor.run() # The start for the Fourmi Command Line interface. if __name__ == '__main__': - arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.2') + arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0') loader = SourceLoader() if arguments["--include"]: diff --git a/tests/test_configurator.py b/tests/test_configurator.py new file mode 100644 index 0000000..eb43cb7 --- /dev/null +++ b/tests/test_configurator.py @@ -0,0 +1,50 @@ +import unittest +from utils.configurator import Configurator + +import ConfigParser + +class TestConfigurator(unittest.TestCase): + + def setUp(self): + self.conf = Configurator() + + def test_set_output(self): + self.conf.set_output(filename="test.txt", fileformat="csv") + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt") + self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") + + self.conf.set_output("results.*format*", "jsonlines") + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json") + self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines") + + self.conf.set_output("results.*format*", "csv") + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv") + self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") + + # def test_start_log(self): + # self.conf.start_log("test.log", True) + # self.conf.start_log("test.log", False) + # self.conf.start_log(None, True) + # self.conf.start_log(None, False) + + def test_read_sourceconfiguration(self): + config = self.conf.read_sourceconfiguration() + self.assertIsInstance(config, ConfigParser.ConfigParser) + + def test_get_section(self): + config = ConfigParser.ConfigParser() + section = self.conf.get_section(config, 'test') + self.assertIn('reliability', section) + self.assertEquals(section['reliability'], '') + + config.set('DEFAULT', 'reliability', 'Low') + + section = self.conf.get_section(config, 'test') + self.assertEquals(section['reliability'], 'Low') + + config.add_section('test') + config.set('test', 'var', 'Maybe') + + section = self.conf.get_section(config, 'test') + self.assertEquals(section['reliability'], 'Low') + self.assertEqual(section['var'], 'Maybe') diff --git a/tests/test_sourceloader.py b/tests/test_sourceloader.py index 1afca2d..9e62057 100644 --- a/tests/test_sourceloader.py +++ 
+++ b/tests/test_sourceloader.py
@@ -1,6 +1,6 @@
 import unittest
 
-from sourceloader import SourceLoader
+from utils.sourceloader import SourceLoader
 
 
 class TestSourceloader(unittest.TestCase):
diff --git a/tests/test_spider.py b/tests/test_spider.py
index 89d6cfc..589a571 100644
--- a/tests/test_spider.py
+++ b/tests/test_spider.py
@@ -3,7 +3,7 @@ import unittest
 from scrapy.http import Request
 
 from FourmiCrawler import spider
-from FourmiCrawler.sources.ChemSpider import ChemSpider
+from FourmiCrawler.sources.NIST import NIST
 from FourmiCrawler.sources.source import Source
 
 
@@ -41,7 +41,7 @@ class TestFoumiSpider(unittest.TestCase):
         self.spi.add_source(src)
         self.assertEqual(self.spi.start_requests(), [])
 
-        src2 = ChemSpider()
+        src2 = NIST()
         self.spi.add_source(src2)
         requests = self.spi.start_requests()
         self.assertGreater(len(requests), 0)
@@ -57,8 +57,8 @@ class TestFoumiSpider(unittest.TestCase):
         self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
         self.assertIn("new_compound", self.spi.synonyms)
 
-        src2 = ChemSpider()
+        src2 = NIST()
         self.spi.add_source(src2)
         self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
         self.assertIn("other_compound", self.spi.synonyms)
-        self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
\ No newline at end of file
+        self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/configurator.py b/utils/configurator.py
new file mode 100644
index 0000000..dfc6330
--- /dev/null
+++ b/utils/configurator.py
@@ -0,0 +1,81 @@
+from scrapy import log
+from scrapy.utils.project import get_project_settings
+import ConfigParser
+
+class Configurator:
+    """
+    A helper class in the fourmi class. This class is used to process the settings as set
+    from one of the Fourmi applications.
+    """
+
+    def __init__(self):
+        self.scrapy_settings = get_project_settings()
+
+
+    def set_output(self, filename, fileformat):
+        """
+        This function manipulates the Scrapy output file settings that normally would be set in the settings file.
+        In the Fourmi project these are command line arguments.
+        :param filename: The filename of the file where the output will be put.
+        :param fileformat: The format in which the output will be.
+        """
+
+        if filename != 'results.*format*':
+            self.scrapy_settings.overrides["FEED_URI"] = filename
+        elif fileformat == "jsonlines":
+            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+        elif fileformat is not None:
+            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+
+        if fileformat is not None:
+            self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
+
+
+    def start_log(self, logfile, verbose):
+        """
+        This function starts the logging functionality of Scrapy using the settings given by the CLI.
+        :param logfile: The location where the logfile will be saved.
+        :param verbose: A boolean value to switch between loglevels.
+ """ + if logfile is not None: + if verbose: + log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG) + else: + log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING) + else: + if verbose: + log.start(logstdout=False, loglevel=log.DEBUG) + else: + log.start(logstdout=True, loglevel=log.WARNING) + + @staticmethod + def read_sourceconfiguration(): + """ + This function reads sources.cfg in the main folder for configuration + variables for sources + :return a ConfigParser object of sources.cfg + """ + config = ConfigParser.ConfigParser() + config.read('sources.cfg') # [TODO]: should be softcoded eventually + return config + + @staticmethod + def get_section(config, sourcename): + """ + This function reads a config section labeled in variable sourcename and + tests whether the reliability variable is set else set to empty string. + Return the default section if the labeled config section does not exist + :param config: a ConfigParser object + :param sourcename: the name of the section to be read + :return a dictionary of the section in the config labeled in sourcename + """ + section = dict() + if config.has_section(sourcename): + section = dict(config.items(sourcename)) + elif config.defaults(): + section = config.defaults() + if 'reliability' not in section: + log.msg('Reliability not set for %s' % sourcename, + level=log.WARNING) + section['reliability'] = '' + return section diff --git a/sourceloader.py b/utils/sourceloader.py similarity index 78% rename from sourceloader.py rename to utils/sourceloader.py index 2ed50a8..9b33657 100644 --- a/sourceloader.py +++ b/utils/sourceloader.py @@ -3,26 +3,31 @@ import os import re from FourmiCrawler.sources.source import Source - +from utils.configurator import Configurator class SourceLoader: sources = [] - def __init__(self, rel_dir="FourmiCrawler/sources"): + def __init__(self, rel_dir="../FourmiCrawler/sources"): """ The initiation of a SourceLoader, selects and indexes a directory for usable sources. + Also loads a configuration file for Sources and passes the arguments in + the named section to the source :param rel_dir: A relative path to a directory. """ path = os.path.dirname(os.path.abspath(__file__)) path += "/" + rel_dir known_parser = set() + config = Configurator.read_sourceconfiguration() + for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: - mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py]) + mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py]) classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] for cls in classes: if issubclass(cls, Source) and cls not in known_parser: - self.sources.append(cls()) # [review] - Would we ever need arguments for the parsers? + sourcecfg = Configurator.get_section(config, cls.__name__) + self.sources.append(cls(sourcecfg)) known_parser.add(cls) def include(self, source_names): @@ -55,4 +60,4 @@ class SourceLoader: string += "Source: " + src.__class__.__name__ string += " - " string += "URI: " + src.website + "\n" - return string \ No newline at end of file + return string