Merge branch 'feature/sources-configuration' into develop
commit 1ab7d0ba76
@@ -4,6 +4,9 @@
#Python Specific ignores
*.pyc

#may contain authentication information
sources.cfg

#THINGS WE WOULD NEVER EVER WANT!
#ignore thumbnails created by windows
Thumbs.db
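The sources.cfg ignored above is the per-source configuration file that utils/configurator.py (further down in this diff) reads with ConfigParser. A minimal sketch of what such a file could look like — the section names follow the source class names and the keys follow the code below, but every value here is a placeholder:

[DEFAULT]
reliability = Unknown

[ChemSpider]
reliability = High
token = <your-chemspider-security-token>

[NIST]
reliability = High

[WikipediaParser]
reliability = Medium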
@@ -9,7 +9,7 @@ from FourmiCrawler.items import Result


# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.

# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not

class ChemSpider(Source):
    """ChemSpider scraper for synonyms and properties
@@ -20,19 +20,23 @@ class ChemSpider(Source):
    somewhere.
    """

    def __init__(self):
        Source.__init__(self)

    website = 'http://www.chemspider.com/*'

    # [TODO] - Save and access token of specific user.
    search = ('Search.asmx/SimpleSearch?query=%s&token='
              '052bfd06-5ce4-43d6-bf12-89eabefd2338')
    search = 'Search.asmx/SimpleSearch?query=%s&token='
    structure = 'Chemical-Structure.%s.html'
    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
                    '052bfd06-5ce4-43d6-bf12-89eabefd2338')
    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='

    def __init__(self, config={}):
        Source.__init__(self, config)
        self.cfg = config
        self.ignore_list = []
        if 'token' not in self.cfg or self.cfg['token'] == '':
            log.msg('ChemSpider token not set or empty, search/MassSpec API '
                    'not available', level=log.WARNING)
            self.cfg['token'] = ''
        self.search += self.cfg['token']
        self.extendedinfo += self.cfg['token']

    ignore_list = []

    def parse(self, response):
        sel = Selector(response)
@@ -44,8 +48,7 @@ class ChemSpider(Source):

        return requests

    @staticmethod
    def parse_properties(sel):
    def parse_properties(self, sel):
        """scrape Experimental Data and Predicted ACD/Labs tabs"""
        properties = []

@@ -76,13 +79,12 @@ class ChemSpider(Source):
                prop_value = m.group(1)
                prop_conditions = m.group(2)

            new_prop = Result({
                'attribute': prop_name,
                'value': prop_value,
                'source': 'ChemSpider Predicted - ACD/Labs Tab',
                'reliability': 'Unknown',
                'conditions': prop_conditions
            })
            new_prop = self.newresult(
                attribute=prop_name,
                value=prop_value,
                source='ChemSpider Predicted - ACD/Labs Tab',
                conditions=prop_conditions
            )
            properties.append(new_prop)
            log.msg('CS prop: |%s| |%s| |%s|' %
                    (new_prop['attribute'], new_prop['value'], new_prop['source']),
@@ -100,14 +102,11 @@ class ChemSpider(Source):
            if line.xpath('span/text()'):
                property_name = line.xpath('span/text()').extract()[0].rstrip()
            else:
                new_prop = Result({
                    'attribute': property_name[:-1],
                    'value': line.xpath('text()').extract()[0].rstrip(),
                    'source': line.xpath(
                        'strong/text()').extract()[0].rstrip(),
                    'reliability': 'Unknown',
                    'conditions': ''
                })
                new_prop = self.newresult(
                    attribute=property_name[:-1],
                    value=line.xpath('text()').extract()[0].rstrip(),
                    source=line.xpath('strong/text()').extract()[0].rstrip(),
                )
                properties.append(new_prop)
                log.msg('CS prop: |%s| |%s| |%s|' %
                        (new_prop['attribute'], new_prop['value'],
@@ -183,25 +182,31 @@ class ChemSpider(Source):
        }
        return synonym

    @staticmethod
    def parse_extendedinfo(response):
    def parse_extendedinfo(self, response):
        """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
        sel = Selector(response)
        properties = []
        names = sel.xpath('*').xpath('name()').extract()
        values = sel.xpath('*').xpath('text()').extract()
        for (name, value) in zip(names, values):
            result = Result({
                'attribute': name,
                'value': value, # These values have no unit!
                'source': 'ChemSpider ExtendedCompoundInfo',
                'reliability': 'Unknown',
                'conditions': ''
            })
            result = self.newresult(
                attribute=name,
                value=value, # These values have no unit!
                source='ChemSpider ExtendedCompoundInfo',
            )
            if result['value']:
                properties.append(result)
        return properties

    def newresult(self, attribute, value, conditions='', source='ChemSpider'):
        return Result({
            'attribute': attribute,
            'value': value,
            'source': source,
            'reliability': self.cfg['reliability'],
            'conditions': conditions
        })

    def parse_searchrequest(self, response):
        """Parse the initial response of the ChemSpider Search API """
        sel = Selector(response)
@@ -224,7 +229,7 @@ class ChemSpider(Source):
                        callback=self.parse_extendedinfo)]

    def new_compound_request(self, compound):
        if compound in self.ignore_list: # [TODO] - add regular expression
        if compound in self.ignore_list or self.cfg['token'] == '':
            return None
        searchurl = self.website[:-1] + self.search % compound
        log.msg('chemspider compound', level=log.DEBUG)
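For orientation, a minimal usage sketch of the new ChemSpider constructor (the import path is the one used by the tests in this diff; the config dict mirrors what Configurator.get_section() returns for a [ChemSpider] section, and the token value is a placeholder):

from FourmiCrawler.sources.ChemSpider import ChemSpider

source = ChemSpider(config={'reliability': 'High', 'token': '<chemspider-token>'})
# The configured token is appended to the search and extendedinfo URLs; with a
# missing or empty token the constructor logs a warning and new_compound_request()
# later returns None instead of issuing a search request.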
@@ -22,10 +22,12 @@ class NIST(Source):

    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

    ignore_list = set()
    cfg = {}

    def __init__(self):
        Source.__init__(self)
    def __init__(self, config={}):
        Source.__init__(self, config)
        self.ignore_list = set()
        self.cfg = config

    def parse(self, response):
        sel = Selector(response)
@@ -114,13 +116,10 @@ class NIST(Source):

        requests = []
        for key, value in data.iteritems():
            result = Result({
                'attribute': key,
                'value': value,
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': ''
            })
            result = self.newresult(
                attribute=key,
                value=value
            )
            requests.append(result)

        return requests
@@ -150,19 +149,16 @@ class NIST(Source):
                name = m.group(1)
                condition = m.group(2)

            result = Result({
                'attribute': name,
                'value': data[1] + ' ' + data[2],
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': condition
            })
            result = self.newresult(
                attribute=name,
                value=data[1] + ' ' + data[2],
                conditions=condition
            )
            log.msg('NIST: |%s|' % data, level=log.DEBUG)
            results.append(result)
        return results

    @staticmethod
    def parse_transition_data(table, summary):
    def parse_transition_data(self, table, summary):
        """Parses the table containing properties regarding phase changes"""
        results = []

@@ -174,19 +170,16 @@ class NIST(Source):

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            result = Result({
                'attribute': summary,
                'value': tds[0] + ' ' + unit,
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
            })
            result = self.newresult(
                attribute=summary,
                value=tds[0] + ' ' + unit,
                conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
            )
            results.append(result)

        return results

    @staticmethod
    def parse_generic_data(table, summary):
    def parse_generic_data(self, table, summary):
        """Parses the common tables of 4 and 5 rows. Assumes they are of the
        form:
        Symbol (unit)|Temperature (K)|Method|Reference|Comment
@@ -202,36 +195,30 @@ class NIST(Source):

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            result = Result({
                'attribute': summary,
                'value': tds[0] + ' ' + unit,
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': '%s K' % tds[1]
            })
            result = self.newresult(
                attribute=summary,
                value=tds[0] + ' ' + unit,
                conditions='%s K' % tds[1]
            )
            results.append(result)
        return results

    @staticmethod
    def parse_antoine_data(table, summary):
    def parse_antoine_data(self, table, summary):
        """Parse table containing parameters for the Antione equation"""
        results = []

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
            result = Result({
                'attribute': summary,
                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': '%s K' % tds[0]
            })
            result = self.newresult(
                attribute=summary,
                value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
                conditions='%s K' % tds[0]
            )
            results.append(result)

        return results

    @staticmethod
    def parse_individual_datapoints(response):
    def parse_individual_datapoints(self, response):
        """Parses the page linked from aggregate data"""
        sel = Selector(response)
        table = sel.xpath('//table[@class="data"]')[0]
@@ -258,17 +245,24 @@ class NIST(Source):
            if m:
                uncertainty = '+- %s ' % m.group(1)
            # [TODO]: get the plusminus sign working in here
            result = Result({
                'attribute': name,
                'value': '%s %s%s' % (tds[0], uncertainty, unit),
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': condition
            })
            result = self.newresult(
                attribute=name,
                value='%s %s%s' % (tds[0], uncertainty, unit),
                conditions=condition
            )
            results.append(result)

        return results

    def newresult(self, attribute, value, conditions=''):
        return Result({
            'attribute': attribute,
            'value': value,
            'source': 'NIST',
            'reliability': self.cfg['reliability'],
            'conditions': conditions
        })

    def new_compound_request(self, compound):
        if compound not in self.ignore_list:
            self.ignore_list.update(compound)
@@ -19,8 +19,11 @@ class WikipediaParser(Source):
    __spider = None
    searched_compounds = []

    def __init__(self):
        Source.__init__(self)
    cfg = {}

    def __init__(self, config={}):
        Source.__init__(self, config)
        self.cfg = config

    def parse(self, response):
        """ Distributes the above described behaviour """
@@ -44,13 +47,10 @@ class WikipediaParser(Source):
        prop_names = tr_list[::2]
        prop_values = tr_list[1::2]
        for i, prop_name in enumerate(prop_names):
            item = Result({
                'attribute': prop_name.extract().encode('utf-8'),
                'value': prop_values[i].extract().encode('utf-8'),
                'source': "Wikipedia",
                'reliability': "Unknown",
                'conditions': ""
            })
            item = self.newresult(
                attribute=prop_name.extract().encode('utf-8'),
                value=prop_values[i].extract().encode('utf-8')
            )
            items.append(item)
            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)

@@ -61,13 +61,10 @@ class WikipediaParser(Source):
            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
                    'normalize-space(string())'):
                item = Result({
                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                    'source': "Wikipedia",
                    'reliability': "Unknown",
                    'conditions': ""
                })
                item = self.newresult(
                    attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                    value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                )
                items.append(item)
                log.msg(
                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
@@ -117,3 +114,12 @@ class WikipediaParser(Source):
        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
        return links

    def newresult(self, attribute, value):
        return Result({
            'attribute': attribute,
            'value': value,
            'source': 'Wikipedia',
            'reliability': self.cfg['reliability'],
            'conditions': ''
        })
@@ -6,7 +6,7 @@ class Source:
    website = "http://something/*" # Regex of URI's the source is able to parse
    _spider = None

    def __init__(self):
    def __init__(self, config={}):
        """
        Initiation of a new Source
        """
@@ -9,8 +9,6 @@ class FourmiSpider(Spider):
    A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
    """
    name = "FourmiSpider"
    _sources = []
    synonyms = set()

    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
        """
@@ -18,6 +16,8 @@ class FourmiSpider(Spider):
        :param compound: compound that will be searched.
        :param selected_attributes: A list of regular expressions that the attributes should match.
        """
        self._sources = []
        self.synonyms = set()
        super(FourmiSpider, self).__init__(*args, **kwargs)
        self.synonyms.add(compound)
        self.selected_attributes = selected_attributes
@@ -1,6 +1,7 @@
import unittest
from utils.configurator import Configurator

import ConfigParser

class TestConfigurator(unittest.TestCase):

@@ -25,3 +26,25 @@ class TestConfigurator(unittest.TestCase):
        # self.conf.start_log("test.log", False)
        # self.conf.start_log(None, True)
        # self.conf.start_log(None, False)

    def test_read_sourceconfiguration(self):
        config = self.conf.read_sourceconfiguration()
        self.assertIsInstance(config, ConfigParser.ConfigParser)

    def test_get_section(self):
        config = ConfigParser.ConfigParser()
        section = self.conf.get_section(config, 'test')
        self.assertIn('reliability', section)
        self.assertEquals(section['reliability'], '')

        config.set('DEFAULT', 'reliability', 'Low')

        section = self.conf.get_section(config, 'test')
        self.assertEquals(section['reliability'], 'Low')

        config.add_section('test')
        config.set('test', 'var', 'Maybe')

        section = self.conf.get_section(config, 'test')
        self.assertEquals(section['reliability'], 'Low')
        self.assertEqual(section['var'], 'Maybe')
@@ -3,7 +3,7 @@ import unittest
from scrapy.http import Request

from FourmiCrawler import spider
from FourmiCrawler.sources.ChemSpider import ChemSpider
from FourmiCrawler.sources.NIST import NIST
from FourmiCrawler.sources.source import Source


@@ -41,7 +41,7 @@ class TestFoumiSpider(unittest.TestCase):
        self.spi.add_source(src)
        self.assertEqual(self.spi.start_requests(), [])

        src2 = ChemSpider()
        src2 = NIST()
        self.spi.add_source(src2)
        requests = self.spi.start_requests()
        self.assertGreater(len(requests), 0)
@@ -57,7 +57,7 @@ class TestFoumiSpider(unittest.TestCase):
        self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
        self.assertIn("new_compound", self.spi.synonyms)

        src2 = ChemSpider()
        src2 = NIST()
        self.spi.add_source(src2)
        self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
        self.assertIn("other_compound", self.spi.synonyms)
@@ -1,6 +1,6 @@
from scrapy import log
from scrapy.utils.project import get_project_settings

import ConfigParser

class Configurator:
    """
@@ -47,3 +47,35 @@ class Configurator:
            log.start(logstdout=False, loglevel=log.DEBUG)
        else:
            log.start(logstdout=True, loglevel=log.WARNING)

    @staticmethod
    def read_sourceconfiguration():
        """
        This function reads sources.cfg in the main folder for configuration
        variables for sources
        :return a ConfigParser object of sources.cfg
        """
        config = ConfigParser.ConfigParser()
        config.read('sources.cfg') # [TODO]: should be softcoded eventually
        return config

    @staticmethod
    def get_section(config, sourcename):
        """
        This function reads a config section labeled in variable sourcename and
        tests whether the reliability variable is set else set to empty string.
        Return the default section if the labeled config section does not exist
        :param config: a ConfigParser object
        :param sourcename: the name of the section to be read
        :return a dictionary of the section in the config labeled in sourcename
        """
        section = dict()
        if config.has_section(sourcename):
            section = dict(config.items(sourcename))
        elif config.defaults():
            section = config.defaults()
        if 'reliability' not in section:
            log.msg('Reliability not set for %s' % sourcename,
                    level=log.WARNING)
            section['reliability'] = ''
        return section
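Taken together, the two new static helpers are meant to be used roughly like this (a sketch; 'ChemSpider' is simply one of the section names that SourceLoader passes in below):

config = Configurator.read_sourceconfiguration()  # parses sources.cfg from the working directory
section = Configurator.get_section(config, 'ChemSpider')
# section is a plain dict; if neither the [ChemSpider] section nor [DEFAULT]
# defines reliability, a warning is logged and section['reliability'] is ''.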
@@ -3,7 +3,7 @@ import os
import re

from FourmiCrawler.sources.source import Source

from utils.configurator import Configurator

class SourceLoader:
    sources = []
@@ -11,18 +11,23 @@ class SourceLoader:
    def __init__(self, rel_dir="../FourmiCrawler/sources"):
        """
        The initiation of a SourceLoader, selects and indexes a directory for usable sources.
        Also loads a configuration file for Sources and passes the arguments in
        the named section to the source
        :param rel_dir: A relative path to a directory.
        """
        path = os.path.dirname(os.path.abspath(__file__))
        path += "/" + rel_dir
        known_parser = set()

        config = Configurator.read_sourceconfiguration()

        for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
            mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
            classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
            for cls in classes:
                if issubclass(cls, Source) and cls not in known_parser:
                    self.sources.append(cls()) # [review] - Would we ever need arguments for the parsers?
                    sourcecfg = Configurator.get_section(config, cls.__name__)
                    self.sources.append(cls(sourcecfg))
                    known_parser.add(cls)

    def include(self, source_names):
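End to end, the loader now hands every discovered Source subclass its own configuration section instead of calling cls() with no arguments. A sketch of the effect, assuming SourceLoader sits in the utils package next to the Configurator (the module path is an assumption, it is not shown in this diff) and a sources.cfg like the sketch above is in the working directory:

from utils.sourceloader import SourceLoader  # module path assumed

loader = SourceLoader()
# loader.sources now holds ChemSpider, NIST, WikipediaParser, ... instances,
# each constructed as cls(sourcecfg) with the reliability (and token) taken
# from its own section of sources.cfg.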