diff --git a/.gitignore b/.gitignore
index 158ef41..14c4e72 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,9 @@
 #Python Specific ignores
 *.pyc
 
+#may contain authentication information
+sources.cfg
+
 #THINGS WE WOULD NEVER EVER WANT!
 #ignore thumbnails created by windows
 Thumbs.db
diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index 8c0bd8b..87a6ee7 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -9,7 +9,7 @@ from FourmiCrawler.items import Result
 
 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
 
-
+# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token is valid
 class ChemSpider(Source):
     """ChemSpider scraper for synonyms and properties
 
@@ -20,19 +20,23 @@ class ChemSpider(Source):
     somewhere.
     """
 
-    def __init__(self):
-        Source.__init__(self)
-        website = 'http://www.chemspider.com/*'
-        # [TODO] - Save and access token of specific user.
-        search = ('Search.asmx/SimpleSearch?query=%s&token='
-                  '052bfd06-5ce4-43d6-bf12-89eabefd2338')
+    search = 'Search.asmx/SimpleSearch?query=%s&token='
     structure = 'Chemical-Structure.%s.html'
-    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
-                    '052bfd06-5ce4-43d6-bf12-89eabefd2338')
+    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
+
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.cfg = config
+        self.ignore_list = []
+        if 'token' not in self.cfg or self.cfg['token'] == '':
+            log.msg('ChemSpider token not set or empty, search/MassSpec API '
+                    'not available', level=log.WARNING)
+            self.cfg['token'] = ''
+        self.search += self.cfg['token']
+        self.extendedinfo += self.cfg['token']
 
-    ignore_list = []
 
     def parse(self, response):
         sel = Selector(response)
@@ -44,8 +48,7 @@ class ChemSpider(Source):
 
         return requests
 
-    @staticmethod
-    def parse_properties(sel):
+    def parse_properties(self, sel):
         """scrape Experimental Data and Predicted ACD/Labs tabs"""
         properties = []
 
@@ -76,13 +79,12 @@ class ChemSpider(Source):
                 prop_value = m.group(1)
                 prop_conditions = m.group(2)
-            new_prop = Result({
-                'attribute': prop_name,
-                'value': prop_value,
-                'source': 'ChemSpider Predicted - ACD/Labs Tab',
-                'reliability': 'Unknown',
-                'conditions': prop_conditions
-            })
+            new_prop = self.newresult(
+                attribute=prop_name,
+                value=prop_value,
+                source='ChemSpider Predicted - ACD/Labs Tab',
+                conditions=prop_conditions
+            )
             properties.append(new_prop)
             log.msg('CS prop: |%s| |%s| |%s|' %
                     (new_prop['attribute'], new_prop['value'],
                      new_prop['source']),
@@ -100,14 +102,11 @@ class ChemSpider(Source):
             if line.xpath('span/text()'):
                 property_name = line.xpath('span/text()').extract()[0].rstrip()
             else:
-                new_prop = Result({
-                    'attribute': property_name[:-1],
-                    'value': line.xpath('text()').extract()[0].rstrip(),
-                    'source': line.xpath(
-                        'strong/text()').extract()[0].rstrip(),
-                    'reliability': 'Unknown',
-                    'conditions': ''
-                })
+                new_prop = self.newresult(
+                    attribute=property_name[:-1],
+                    value=line.xpath('text()').extract()[0].rstrip(),
+                    source=line.xpath('strong/text()').extract()[0].rstrip(),
+                )
                 properties.append(new_prop)
                 log.msg('CS prop: |%s| |%s| |%s|' %
                         (new_prop['attribute'], new_prop['value'],
@@ -183,25 +182,31 @@ class ChemSpider(Source):
         }
         return synonym
 
-    @staticmethod
-    def parse_extendedinfo(response):
+    def parse_extendedinfo(self, response):
         """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
         sel = Selector(response)
 
         properties = []
 
         names = sel.xpath('*').xpath('name()').extract()
         values = sel.xpath('*').xpath('text()').extract()
         for (name, value) in zip(names, values):
-            result = Result({
-                'attribute': name,
-                'value': value,  # These values have no unit!
-                'source': 'ChemSpider ExtendedCompoundInfo',
-                'reliability': 'Unknown',
-                'conditions': ''
-            })
+            result = self.newresult(
+                attribute=name,
+                value=value,  # These values have no unit!
+                source='ChemSpider ExtendedCompoundInfo',
+            )
             if result['value']:
                 properties.append(result)
         return properties
 
+    def newresult(self, attribute, value, conditions='', source='ChemSpider'):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': source,
+            'reliability': self.cfg['reliability'],
+            'conditions': conditions
+        })
+
     def parse_searchrequest(self, response):
         """Parse the initial response of the ChemSpider Search API """
         sel = Selector(response)
@@ -224,7 +229,7 @@ class ChemSpider(Source):
                                 callback=self.parse_extendedinfo)]
 
     def new_compound_request(self, compound):
-        if compound in self.ignore_list:  # [TODO] - add regular expression
+        if compound in self.ignore_list or self.cfg['token'] == '':
             return None
         searchurl = self.website[:-1] + self.search % compound
         log.msg('chemspider compound', level=log.DEBUG)
diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 6e8fabb..3c323ef 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -22,10 +22,12 @@ class NIST(Source):
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
-    ignore_list = set()
+    cfg = {}
 
-    def __init__(self):
-        Source.__init__(self)
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.ignore_list = set()
+        self.cfg = config
 
     def parse(self, response):
         sel = Selector(response)
@@ -114,13 +116,10 @@ class NIST(Source):
 
         requests = []
         for key, value in data.iteritems():
-            result = Result({
-                'attribute': key,
-                'value': value,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': ''
-            })
+            result = self.newresult(
+                attribute=key,
+                value=value
+            )
             requests.append(result)
 
         return requests
@@ -150,19 +149,16 @@ class NIST(Source):
             name = m.group(1)
             condition = m.group(2)
 
-            result = Result({
-                'attribute': name,
-                'value': data[1] + ' ' + data[2],
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': condition
-            })
+            result = self.newresult(
+                attribute=name,
+                value=data[1] + ' ' + data[2],
+                conditions=condition
+            )
             log.msg('NIST: |%s|' % data, level=log.DEBUG)
             results.append(result)
         return results
 
-    @staticmethod
-    def parse_transition_data(table, summary):
+    def parse_transition_data(self, table, summary):
         """Parses the table containing properties regarding phase changes"""
         results = []
 
@@ -174,19 +170,16 @@ class NIST(Source):
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': tds[0] + ' ' + unit,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
-            })
+            result = self.newresult(
+                attribute=summary,
+                value=tds[0] + ' ' + unit,
+                conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
+            )
             results.append(result)
 
         return results
 
-    @staticmethod
-    def parse_generic_data(table, summary):
+    def parse_generic_data(self, table, summary):
         """Parses the common tables of 4 and 5 rows.
         Assumes they are of the form:
         Symbol (unit)|Temperature (K)|Method|Reference|Comment
@@ -202,36 +195,30 @@ class NIST(Source):
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': tds[0] + ' ' + unit,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K' % tds[1]
-            })
+            result = self.newresult(
+                attribute=summary,
+                value=tds[0] + ' ' + unit,
+                conditions='%s K' % tds[1]
+            )
             results.append(result)
 
         return results
 
-    @staticmethod
-    def parse_antoine_data(table, summary):
+    def parse_antoine_data(self, table, summary):
         """Parse table containing parameters for the Antoine equation"""
         results = []
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K' % tds[0]
-            })
+            result = self.newresult(
+                attribute=summary,
+                value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
+                conditions='%s K' % tds[0]
+            )
             results.append(result)
 
         return results
 
-    @staticmethod
-    def parse_individual_datapoints(response):
+    def parse_individual_datapoints(self, response):
         """Parses the page linked from aggregate data"""
         sel = Selector(response)
         table = sel.xpath('//table[@class="data"]')[0]
@@ -258,17 +245,24 @@ class NIST(Source):
             if m:
                 uncertainty = '+- %s ' % m.group(1)
             # [TODO]: get the plusminus sign working in here
-            result = Result({
-                'attribute': name,
-                'value': '%s %s%s' % (tds[0], uncertainty, unit),
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': condition
-            })
+            result = self.newresult(
+                attribute=name,
+                value='%s %s%s' % (tds[0], uncertainty, unit),
+                conditions=condition
+            )
             results.append(result)
 
         return results
 
+    def newresult(self, attribute, value, conditions=''):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': 'NIST',
+            'reliability': self.cfg['reliability'],
+            'conditions': conditions
+        })
+
     def new_compound_request(self, compound):
         if compound not in self.ignore_list:
             self.ignore_list.update(compound)
diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index 868b49f..8722cef 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -19,8 +19,11 @@ class WikipediaParser(Source):
     __spider = None
     searched_compounds = []
 
-    def __init__(self):
-        Source.__init__(self)
+    cfg = {}
+
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.cfg = config
 
     def parse(self, response):
         """ Distributes the above described behaviour """
@@ -44,13 +47,10 @@ class WikipediaParser(Source):
         prop_names = tr_list[::2]
         prop_values = tr_list[1::2]
         for i, prop_name in enumerate(prop_names):
-            item = Result({
-                'attribute': prop_name.extract().encode('utf-8'),
-                'value': prop_values[i].extract().encode('utf-8'),
-                'source': "Wikipedia",
-                'reliability': "Unknown",
-                'conditions': ""
-            })
+            item = self.newresult(
+                attribute=prop_name.extract().encode('utf-8'),
+                value=prop_values[i].extract().encode('utf-8')
+            )
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']),
                     level=log.DEBUG)
@@ -61,13 +61,10 @@ class WikipediaParser(Source):
             log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
             if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
                     'normalize-space(string())'):
-                item = Result({
-                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'source': "Wikipedia",
-                    'reliability': "Unknown",
-                    'conditions': ""
-                })
+                item = self.newresult(
+                    attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                )
                 items.append(item)
                 log.msg(
                     'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
@@ -116,4 +113,13 @@ class WikipediaParser(Source):
         """ find external links, named 'Identifiers' to different sources. """
         links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                           '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
-        return links
\ No newline at end of file
+        return links
+
+    def newresult(self, attribute, value):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': 'Wikipedia',
+            'reliability': self.cfg['reliability'],
+            'conditions': ''
+        })
diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py
index d289d72..a609bb9 100644
--- a/FourmiCrawler/sources/source.py
+++ b/FourmiCrawler/sources/source.py
@@ -6,7 +6,7 @@ class Source:
     website = "http://something/*"  # Regex of URI's the source is able to parse
     _spider = None
 
-    def __init__(self):
+    def __init__(self, config={}):
         """
         Initiation of a new Source
         """
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 60f7363..5c09f07 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -9,8 +9,6 @@ class FourmiSpider(Spider):
     A spider written for the Fourmi Project which calls upon all available
     sources to request and scrape data.
     """
     name = "FourmiSpider"
-    _sources = []
-    synonyms = set()
 
     def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
         """
         Initiation of the Spider
@@ -19,6 +17,8 @@ class FourmiSpider(Spider):
         :param compound: compound that will be searched.
         :param selected_attributes: A list of regular expressions that the attributes should match.
""" + self._sources = [] + self.synonyms = set() super(FourmiSpider, self).__init__(*args, **kwargs) self.synonyms.add(compound) self.selected_attributes = selected_attributes diff --git a/tests/test_configurator.py b/tests/test_configurator.py index 8cc61ea..eb43cb7 100644 --- a/tests/test_configurator.py +++ b/tests/test_configurator.py @@ -1,6 +1,7 @@ import unittest from utils.configurator import Configurator +import ConfigParser class TestConfigurator(unittest.TestCase): @@ -24,4 +25,26 @@ class TestConfigurator(unittest.TestCase): # self.conf.start_log("test.log", True) # self.conf.start_log("test.log", False) # self.conf.start_log(None, True) - # self.conf.start_log(None, False) \ No newline at end of file + # self.conf.start_log(None, False) + + def test_read_sourceconfiguration(self): + config = self.conf.read_sourceconfiguration() + self.assertIsInstance(config, ConfigParser.ConfigParser) + + def test_get_section(self): + config = ConfigParser.ConfigParser() + section = self.conf.get_section(config, 'test') + self.assertIn('reliability', section) + self.assertEquals(section['reliability'], '') + + config.set('DEFAULT', 'reliability', 'Low') + + section = self.conf.get_section(config, 'test') + self.assertEquals(section['reliability'], 'Low') + + config.add_section('test') + config.set('test', 'var', 'Maybe') + + section = self.conf.get_section(config, 'test') + self.assertEquals(section['reliability'], 'Low') + self.assertEqual(section['var'], 'Maybe') diff --git a/tests/test_spider.py b/tests/test_spider.py index 89d6cfc..589a571 100644 --- a/tests/test_spider.py +++ b/tests/test_spider.py @@ -3,7 +3,7 @@ import unittest from scrapy.http import Request from FourmiCrawler import spider -from FourmiCrawler.sources.ChemSpider import ChemSpider +from FourmiCrawler.sources.NIST import NIST from FourmiCrawler.sources.source import Source @@ -41,7 +41,7 @@ class TestFoumiSpider(unittest.TestCase): self.spi.add_source(src) self.assertEqual(self.spi.start_requests(), []) - src2 = ChemSpider() + src2 = NIST() self.spi.add_source(src2) requests = self.spi.start_requests() self.assertGreater(len(requests), 0) @@ -57,8 +57,8 @@ class TestFoumiSpider(unittest.TestCase): self.assertEqual(self.spi.get_synonym_requests("new_compound"), []) self.assertIn("new_compound", self.spi.synonyms) - src2 = ChemSpider() + src2 = NIST() self.spi.add_source(src2) self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request) self.assertIn("other_compound", self.spi.synonyms) - self.assertEqual(self.spi.get_synonym_requests("other_compound"), []) \ No newline at end of file + self.assertEqual(self.spi.get_synonym_requests("other_compound"), []) diff --git a/utils/configurator.py b/utils/configurator.py index 90e0320..dfc6330 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -1,6 +1,6 @@ from scrapy import log from scrapy.utils.project import get_project_settings - +import ConfigParser class Configurator: """ @@ -47,3 +47,35 @@ class Configurator: log.start(logstdout=False, loglevel=log.DEBUG) else: log.start(logstdout=True, loglevel=log.WARNING) + + @staticmethod + def read_sourceconfiguration(): + """ + This function reads sources.cfg in the main folder for configuration + variables for sources + :return a ConfigParser object of sources.cfg + """ + config = ConfigParser.ConfigParser() + config.read('sources.cfg') # [TODO]: should be softcoded eventually + return config + + @staticmethod + def get_section(config, sourcename): + """ + This function reads a config section 
+        checks whether the reliability variable is set, setting it to an
+        empty string when it is missing. The default section is returned if
+        the named config section does not exist.
+        :param config: a ConfigParser object
+        :param sourcename: the name of the section to be read
+        :return: a dictionary of the config section named in sourcename
+        """
+        section = dict()
+        if config.has_section(sourcename):
+            section = dict(config.items(sourcename))
+        elif config.defaults():
+            section = config.defaults()
+        if 'reliability' not in section:
+            log.msg('Reliability not set for %s' % sourcename,
+                    level=log.WARNING)
+            section['reliability'] = ''
+        return section
diff --git a/utils/sourceloader.py b/utils/sourceloader.py
index b6bb0fd..9b33657 100644
--- a/utils/sourceloader.py
+++ b/utils/sourceloader.py
@@ -3,7 +3,7 @@ import os
 import re
 
 from FourmiCrawler.sources.source import Source
-
+from utils.configurator import Configurator
 
 class SourceLoader:
     sources = []
@@ -11,18 +11,23 @@ class SourceLoader:
     def __init__(self, rel_dir="../FourmiCrawler/sources"):
         """
         The initiation of a SourceLoader, selects and indexes a directory for usable sources.
+        Also loads a configuration file for sources and passes the arguments
+        in the named section to each source.
         :param rel_dir: A relative path to a directory.
         """
         path = os.path.dirname(os.path.abspath(__file__))
         path += "/" + rel_dir
         known_parser = set()
 
+        config = Configurator.read_sourceconfiguration()
+
         for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
             mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
             classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
             for cls in classes:
                 if issubclass(cls, Source) and cls not in known_parser:
-                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
+                    sourcecfg = Configurator.get_section(config, cls.__name__)
+                    self.sources.append(cls(sourcecfg))
                     known_parser.add(cls)
 
     def include(self, source_names):
@@ -55,4 +60,4 @@ class SourceLoader:
             string += "Source: " + src.__class__.__name__
             string += " - "
             string += "URI: " + src.website + "\n"
-        return string
\ No newline at end of file
+        return string
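Note: since sources.cfg is deliberately kept out of version control (first hunk above), the patch ships no example configuration. A minimal sketch of what the file could look like, given how the code reads it — SourceLoader looks up one section per Source subclass name via cls.__name__, Configurator.get_section falls back to [DEFAULT] and expects a reliability key, and ChemSpider additionally reads an optional token — with every value below a hypothetical placeholder:

    [DEFAULT]
    reliability = Unknown

    [ChemSpider]
    reliability = High
    token = <your-chemspider-api-token>

    [NIST]
    reliability = High

    [WikipediaParser]
    reliability = Medium

With such a file in place, SourceLoader hands each section to the matching source constructor, and ChemSpider only issues search requests when its token value is non-empty.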
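The lookup order implemented by Configurator.get_section (named section first, then the DEFAULT section, then an empty reliability plus a warning) is what test_get_section above exercises; the same three stages in a standalone Python sketch, with 'dummy-token' a made-up value for illustration:

    import ConfigParser

    from utils.configurator import Configurator

    config = ConfigParser.ConfigParser()

    # No sections and no defaults yet: get_section returns an empty dict
    # with 'reliability' filled in as '' (and logs a warning).
    assert Configurator.get_section(config, 'ChemSpider')['reliability'] == ''

    # Values in [DEFAULT] apply to any source without its own section.
    config.set('DEFAULT', 'reliability', 'Low')
    assert Configurator.get_section(config, 'NIST')['reliability'] == 'Low'

    # A named section inherits the DEFAULT entries and adds its own keys.
    config.add_section('ChemSpider')
    config.set('ChemSpider', 'token', 'dummy-token')
    section = Configurator.get_section(config, 'ChemSpider')
    assert section['reliability'] == 'Low'
    assert section['token'] == 'dummy-token'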