Merge branch 'release/v0.5.0'
commit 16248577b0
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,9 @@
 #Python Specific ignores
 *.pyc
 
+#may contain authentication information
+sources.cfg
+
 #THINGS WE WOULD NEVER EVER WANT!
 #ignore thumbnails created by windows
 Thumbs.db
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,7 +10,7 @@ install:
 
 # command to run tests, e.g. python setup.py test
 script:
-  - nosetests --with-coverage --cover-package=FourmiCrawler tests
+  - nosetests --with-coverage --cover-package=FourmiCrawler,utils tests
 
 notifications:
   slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -9,7 +9,7 @@ from FourmiCrawler.items import Result
 
 
 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
+# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not
 
 class ChemSpider(Source):
     """ChemSpider scraper for synonyms and properties
@@ -20,19 +20,23 @@ class ChemSpider(Source):
     somewhere.
     """
 
-    def __init__(self):
-        Source.__init__(self)
 
     website = 'http://www.chemspider.com/*'
 
-    # [TODO] - Save and access token of specific user.
-    search = ('Search.asmx/SimpleSearch?query=%s&token='
-              '052bfd06-5ce4-43d6-bf12-89eabefd2338')
+    search = 'Search.asmx/SimpleSearch?query=%s&token='
     structure = 'Chemical-Structure.%s.html'
-    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
-                    '052bfd06-5ce4-43d6-bf12-89eabefd2338')
+    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
+
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.cfg = config
+        self.ignore_list = []
+        if 'token' not in self.cfg or self.cfg['token'] == '':
+            log.msg('ChemSpider token not set or empty, search/MassSpec API '
+                    'not available', level=log.WARNING)
+            self.cfg['token'] = ''
+        self.search += self.cfg['token']
+        self.extendedinfo += self.cfg['token']
 
-    ignore_list = []
 
     def parse(self, response):
         sel = Selector(response)
@@ -44,8 +48,7 @@ class ChemSpider(Source):
 
         return requests
 
-    @staticmethod
-    def parse_properties(sel):
+    def parse_properties(self, sel):
        """scrape Experimental Data and Predicted ACD/Labs tabs"""
        properties = []

@@ -76,13 +79,12 @@ class ChemSpider(Source):
             prop_value = m.group(1)
             prop_conditions = m.group(2)
 
-            new_prop = Result({
-                'attribute': prop_name,
-                'value': prop_value,
-                'source': 'ChemSpider Predicted - ACD/Labs Tab',
-                'reliability': 'Unknown',
-                'conditions': prop_conditions
-            })
+            new_prop = self.newresult(
+                attribute=prop_name,
+                value=prop_value,
+                source='ChemSpider Predicted - ACD/Labs Tab',
+                conditions=prop_conditions
+            )
             properties.append(new_prop)
             log.msg('CS prop: |%s| |%s| |%s|' %
                     (new_prop['attribute'], new_prop['value'], new_prop['source']),
@@ -100,14 +102,11 @@ class ChemSpider(Source):
             if line.xpath('span/text()'):
                 property_name = line.xpath('span/text()').extract()[0].rstrip()
             else:
-                new_prop = Result({
-                    'attribute': property_name[:-1],
-                    'value': line.xpath('text()').extract()[0].rstrip(),
-                    'source': line.xpath(
-                        'strong/text()').extract()[0].rstrip(),
-                    'reliability': 'Unknown',
-                    'conditions': ''
-                })
+                new_prop = self.newresult(
+                    attribute=property_name[:-1],
+                    value=line.xpath('text()').extract()[0].rstrip(),
+                    source=line.xpath('strong/text()').extract()[0].rstrip(),
+                )
                 properties.append(new_prop)
                 log.msg('CS prop: |%s| |%s| |%s|' %
                         (new_prop['attribute'], new_prop['value'],
@@ -183,25 +182,31 @@ class ChemSpider(Source):
         }
         return synonym
 
-    @staticmethod
-    def parse_extendedinfo(response):
+    def parse_extendedinfo(self, response):
         """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
         sel = Selector(response)
         properties = []
         names = sel.xpath('*').xpath('name()').extract()
         values = sel.xpath('*').xpath('text()').extract()
         for (name, value) in zip(names, values):
-            result = Result({
-                'attribute': name,
-                'value': value,  # These values have no unit!
-                'source': 'ChemSpider ExtendedCompoundInfo',
-                'reliability': 'Unknown',
-                'conditions': ''
-            })
+            result = self.newresult(
+                attribute=name,
+                value=value,  # These values have no unit!
+                source='ChemSpider ExtendedCompoundInfo',
+            )
             if result['value']:
                 properties.append(result)
         return properties
 
+    def newresult(self, attribute, value, conditions='', source='ChemSpider'):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': source,
+            'reliability': self.cfg['reliability'],
+            'conditions': conditions
+        })
+
     def parse_searchrequest(self, response):
         """Parse the initial response of the ChemSpider Search API """
         sel = Selector(response)
@@ -224,7 +229,7 @@ class ChemSpider(Source):
                 callback=self.parse_extendedinfo)]
 
     def new_compound_request(self, compound):
-        if compound in self.ignore_list:  # [TODO] - add regular expression
+        if compound in self.ignore_list or self.cfg['token'] == '':
             return None
         searchurl = self.website[:-1] + self.search % compound
         log.msg('chemspider compound', level=log.DEBUG)
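The ChemSpider token and reliability now come from the per-source configuration instead of being hardcoded. A minimal sketch of a matching sources.cfg entry (section names follow the source class name, since Configurator.get_section looks sections up by cls.__name__; the token and reliability values below are placeholders, not real credentials):

```ini
[DEFAULT]
reliability = Unknown

[ChemSpider]
reliability = High
token = your-chemspider-token-here
```

This is also why sources.cfg is added to .gitignore above: it may contain authentication information.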
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -22,10 +22,12 @@ class NIST(Source):
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
-    ignore_list = set()
+    cfg = {}
 
-    def __init__(self):
-        Source.__init__(self)
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.ignore_list = set()
+        self.cfg = config
 
     def parse(self, response):
         sel = Selector(response)
@@ -114,13 +116,10 @@ class NIST(Source):
 
         requests = []
         for key, value in data.iteritems():
-            result = Result({
-                'attribute': key,
-                'value': value,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': ''
-            })
+            result = self.newresult(
+                attribute=key,
+                value=value
+            )
             requests.append(result)
 
         return requests
@@ -150,19 +149,16 @@ class NIST(Source):
             name = m.group(1)
             condition = m.group(2)
 
-            result = Result({
-                'attribute': name,
-                'value': data[1] + ' ' + data[2],
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': condition
-            })
+            result = self.newresult(
+                attribute=name,
+                value=data[1] + ' ' + data[2],
+                conditions=condition
+            )
             log.msg('NIST: |%s|' % data, level=log.DEBUG)
             results.append(result)
         return results
 
-    @staticmethod
-    def parse_transition_data(table, summary):
+    def parse_transition_data(self, table, summary):
         """Parses the table containing properties regarding phase changes"""
         results = []
 
@@ -174,19 +170,16 @@ class NIST(Source):
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': tds[0] + ' ' + unit,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
-            })
+            result = self.newresult(
+                attribute=summary,
+                value=tds[0] + ' ' + unit,
+                conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
+            )
             results.append(result)
 
         return results
 
-    @staticmethod
-    def parse_generic_data(table, summary):
+    def parse_generic_data(self, table, summary):
         """Parses the common tables of 4 and 5 rows. Assumes they are of the
         form:
         Symbol (unit)|Temperature (K)|Method|Reference|Comment
@@ -202,36 +195,30 @@ class NIST(Source):
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': tds[0] + ' ' + unit,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K' % tds[1]
-            })
+            result = self.newresult(
+                attribute=summary,
+                value=tds[0] + ' ' + unit,
+                conditions='%s K' % tds[1]
+            )
             results.append(result)
         return results
 
-    @staticmethod
-    def parse_antoine_data(table, summary):
+    def parse_antoine_data(self, table, summary):
         """Parse table containing parameters for the Antione equation"""
         results = []
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K' % tds[0]
-            })
+            result = self.newresult(
+                attribute=summary,
+                value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
+                conditions='%s K' % tds[0]
+            )
             results.append(result)
 
         return results
 
-    @staticmethod
-    def parse_individual_datapoints(response):
+    def parse_individual_datapoints(self, response):
         """Parses the page linked from aggregate data"""
         sel = Selector(response)
         table = sel.xpath('//table[@class="data"]')[0]
@@ -258,17 +245,24 @@ class NIST(Source):
             if m:
                 uncertainty = '+- %s ' % m.group(1)
             # [TODO]: get the plusminus sign working in here
-            result = Result({
-                'attribute': name,
-                'value': '%s %s%s' % (tds[0], uncertainty, unit),
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': condition
-            })
+            result = self.newresult(
+                attribute=name,
+                value='%s %s%s' % (tds[0], uncertainty, unit),
+                conditions=condition
+            )
             results.append(result)
 
         return results
 
+    def newresult(self, attribute, value, conditions=''):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': 'NIST',
+            'reliability': self.cfg['reliability'],
+            'conditions': conditions
+        })
 
     def new_compound_request(self, compound):
         if compound not in self.ignore_list:
             self.ignore_list.update(compound)
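Every NIST parse_* method now funnels its output through newresult, so the reliability field is filled from the injected config instead of the old hardcoded 'Unknown'. A small illustrative sketch (the config dict and values are made up; normally SourceLoader supplies the dict from sources.cfg):

```python
from FourmiCrawler.sources.NIST import NIST

# Hypothetical config; only 'reliability' is read by newresult.
nist = NIST(config={'reliability': 'Medium'})
result = nist.newresult(attribute='Boiling point', value='373 K')
print result['source'], result['reliability']  # -> NIST Medium
```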
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -19,8 +19,11 @@ class WikipediaParser(Source):
     __spider = None
     searched_compounds = []
 
-    def __init__(self):
-        Source.__init__(self)
+    cfg = {}
+
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.cfg = config
 
     def parse(self, response):
         """ Distributes the above described behaviour """
@@ -44,13 +47,10 @@ class WikipediaParser(Source):
         prop_names = tr_list[::2]
         prop_values = tr_list[1::2]
         for i, prop_name in enumerate(prop_names):
-            item = Result({
-                'attribute': prop_name.extract().encode('utf-8'),
-                'value': prop_values[i].extract().encode('utf-8'),
-                'source': "Wikipedia",
-                'reliability': "Unknown",
-                'conditions': ""
-            })
+            item = self.newresult(
+                attribute=prop_name.extract().encode('utf-8'),
+                value=prop_values[i].extract().encode('utf-8')
+            )
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
 
@@ -61,13 +61,10 @@ class WikipediaParser(Source):
             log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
             if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
                     'normalize-space(string())'):
-                item = Result({
-                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'source': "Wikipedia",
-                    'reliability': "Unknown",
-                    'conditions': ""
-                })
+                item = self.newresult(
+                    attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                )
                 items.append(item)
                 log.msg(
                     'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
@@ -116,4 +113,13 @@ class WikipediaParser(Source):
         """ find external links, named 'Identifiers' to different sources. """
         links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                           '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
         return links
+
+    def newresult(self, attribute, value):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': 'Wikipedia',
+            'reliability': self.cfg['reliability'],
+            'conditions': ''
+        })
--- a/FourmiCrawler/sources/source.py
+++ b/FourmiCrawler/sources/source.py
@@ -6,7 +6,7 @@ class Source:
     website = "http://something/*"  # Regex of URI's the source is able to parse
     _spider = None
 
-    def __init__(self):
+    def __init__(self, config={}):
         """
         Initiation of a new Source
         """
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -9,8 +9,6 @@ class FourmiSpider(Spider):
     A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
     """
     name = "FourmiSpider"
-    _sources = []
-    synonyms = set()
 
     def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
         """
@@ -18,6 +16,8 @@ class FourmiSpider(Spider):
         :param compound: compound that will be searched.
         :param selected_attributes: A list of regular expressions that the attributes should match.
         """
+        self._sources = []
+        self.synonyms = set()
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.add(compound)
         self.selected_attributes = selected_attributes
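Moving _sources and synonyms from class attributes into __init__ is more than cosmetic: class-level mutables in Python are shared across all instances, so two spiders would previously have appended into the same lists. A minimal sketch of the pitfall, using a hypothetical class:

```python
class Demo(object):
    shared = []            # class attribute: one list shared by all instances

    def __init__(self):
        self.own = []      # instance attribute: a fresh list per instance

a, b = Demo(), Demo()
a.shared.append(1)
a.own.append(1)
print b.shared             # [1] - leaked over from instance a
print b.own                # []  - properly isolated
```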
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 # Fourmi
 
-**Master branch**: [](https://travis-ci.org/Recondor/Fourmi)
+**Master branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=master)
 
-**Developing branch**: [](https://travis-ci.org/Recondor/Fourmi)
+**Developing branch**: [](https://travis-ci.org/jjdekker/Fourmi) [](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)
 
 Fourmi is an web scraper for chemical substances. The program is designed to be
 used as a search engine to search multiple chemical databases for a specific
--- a/fourmi.py
+++ b/fourmi.py
@@ -17,8 +17,8 @@ Options:
     --version                Show version.
     --verbose                Verbose logging output.
     --log=<file>             Save log to an file.
-    -o <file> --output=<file>  Output file [default: result.*format*]
-    -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
+    -o <file> --output=<file>  Output file [default: results.*format*]
+    -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: csv]
     --include=<regex>        Include only sources that match these regular expressions split by a comma.
     --exclude=<regex>        Exclude the sources that match these regular expressions split by a comma.
 """
@@ -30,7 +30,8 @@ from scrapy.utils.project import get_project_settings
 import docopt
 
 from FourmiCrawler.spider import FourmiSpider
-from sourceloader import SourceLoader
+from utils.configurator import Configurator
+from utils.sourceloader import SourceLoader
 
 
 def setup_crawler(compound, settings, source_loader, attributes):
@@ -50,59 +51,22 @@ def setup_crawler(compound, settings, source_loader, attributes):
     crawler.start()
 
 
-def scrapy_settings_manipulation(docopt_arguments):
-    """
-    This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
-    project these are command line arguments.
-    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
-    """
-    settings = get_project_settings()
-
-    if docopt_arguments["--output"] != 'result.*format*':
-        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
-    elif docopt_arguments["--format"] == "jsonlines":
-        settings.overrides["FEED_URI"] = "results.json"
-    elif docopt_arguments["--format"] is not None:
-        settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]
-
-    if docopt_arguments["--format"] is not None:
-        settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
-
-    return settings
-
-
-def start_log(docopt_arguments):
-    """
-    This function starts the logging functionality of Scrapy using the settings given by the CLI.
-    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
-    """
-    if docopt_arguments["--log"] is not None:
-        if docopt_arguments["--verbose"]:
-            log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
-        else:
-            log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
-    else:
-        if docopt_arguments["--verbose"]:
-            log.start(logstdout=False, loglevel=log.DEBUG)
-        else:
-            log.start(logstdout=True, loglevel=log.WARNING)
-
-
 def search(docopt_arguments, source_loader):
     """
     The function that facilitates the search for a specific compound.
     :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
     :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
     """
-    start_log(docopt_arguments)
-    settings = scrapy_settings_manipulation(docopt_arguments)
-    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
+    conf = Configurator()
+    conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
+    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
+    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
     reactor.run()
 
 
 # The start for the Fourmi Command Line interface.
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.2')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0')
     loader = SourceLoader()
 
     if arguments["--include"]:
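search() now delegates all settings work to the new Configurator helper. The same wiring, shown standalone (argument values are illustrative):

```python
from utils.configurator import Configurator

conf = Configurator()                        # wraps get_project_settings()
conf.start_log("fourmi.log", True)           # verbose -> DEBUG logging to file
conf.set_output("results.*format*", "csv")   # default name + csv -> FEED_URI=results.csv
settings = conf.scrapy_settings              # handed on to setup_crawler()
```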
--- /dev/null
+++ b/tests/test_configurator.py
@@ -0,0 +1,50 @@
+import unittest
+from utils.configurator import Configurator
+
+import ConfigParser
+
+
+class TestConfigurator(unittest.TestCase):
+
+    def setUp(self):
+        self.conf = Configurator()
+
+    def test_set_output(self):
+        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
+
+        self.conf.set_output("results.*format*", "jsonlines")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")
+
+        self.conf.set_output("results.*format*", "csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
+
+    # def test_start_log(self):
+    #     self.conf.start_log("test.log", True)
+    #     self.conf.start_log("test.log", False)
+    #     self.conf.start_log(None, True)
+    #     self.conf.start_log(None, False)
+
+    def test_read_sourceconfiguration(self):
+        config = self.conf.read_sourceconfiguration()
+        self.assertIsInstance(config, ConfigParser.ConfigParser)
+
+    def test_get_section(self):
+        config = ConfigParser.ConfigParser()
+        section = self.conf.get_section(config, 'test')
+        self.assertIn('reliability', section)
+        self.assertEquals(section['reliability'], '')
+
+        config.set('DEFAULT', 'reliability', 'Low')
+
+        section = self.conf.get_section(config, 'test')
+        self.assertEquals(section['reliability'], 'Low')
+
+        config.add_section('test')
+        config.set('test', 'var', 'Maybe')
+
+        section = self.conf.get_section(config, 'test')
+        self.assertEquals(section['reliability'], 'Low')
+        self.assertEqual(section['var'], 'Maybe')
--- a/tests/test_sourceloader.py
+++ b/tests/test_sourceloader.py
@@ -1,6 +1,6 @@
 import unittest
 
-from sourceloader import SourceLoader
+from utils.sourceloader import SourceLoader
 
 
 class TestSourceloader(unittest.TestCase):
--- a/tests/test_spider.py
+++ b/tests/test_spider.py
@@ -3,7 +3,7 @@ import unittest
 from scrapy.http import Request
 
 from FourmiCrawler import spider
-from FourmiCrawler.sources.ChemSpider import ChemSpider
+from FourmiCrawler.sources.NIST import NIST
 from FourmiCrawler.sources.source import Source
 
 
@@ -41,7 +41,7 @@ class TestFoumiSpider(unittest.TestCase):
         self.spi.add_source(src)
         self.assertEqual(self.spi.start_requests(), [])
 
-        src2 = ChemSpider()
+        src2 = NIST()
         self.spi.add_source(src2)
         requests = self.spi.start_requests()
         self.assertGreater(len(requests), 0)
@@ -57,8 +57,8 @@ class TestFoumiSpider(unittest.TestCase):
         self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
         self.assertIn("new_compound", self.spi.synonyms)
 
-        src2 = ChemSpider()
+        src2 = NIST()
         self.spi.add_source(src2)
         self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
         self.assertIn("other_compound", self.spi.synonyms)
         self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
--- /dev/null
+++ b/utils/__init__.py
(new empty file)
--- /dev/null
+++ b/utils/configurator.py
@@ -0,0 +1,81 @@
+from scrapy import log
+from scrapy.utils.project import get_project_settings
+import ConfigParser
+
+
+class Configurator:
+    """
+    A helper class in the fourmi class. This class is used to process the settings as set
+    from one of the Fourmi applications.
+    """
+
+    def __init__(self):
+        self.scrapy_settings = get_project_settings()
+
+    def set_output(self, filename, fileformat):
+        """
+        This function manipulates the Scrapy output file settings that normally would be set in the settings file.
+        In the Fourmi project these are command line arguments.
+        :param filename: The filename of the file where the output will be put.
+        :param fileformat: The format in which the output will be.
+        """
+
+        if filename != 'results.*format*':
+            self.scrapy_settings.overrides["FEED_URI"] = filename
+        elif fileformat == "jsonlines":
+            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+        elif fileformat is not None:
+            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+
+        if fileformat is not None:
+            self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
+
+    def start_log(self, logfile, verbose):
+        """
+        This function starts the logging functionality of Scrapy using the settings given by the CLI.
+        :param logfile: The location where the logfile will be saved.
+        :param verbose: A boolean value to switch between loglevels.
+        """
+        if logfile is not None:
+            if verbose:
+                log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
+            else:
+                log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
+        else:
+            if verbose:
+                log.start(logstdout=False, loglevel=log.DEBUG)
+            else:
+                log.start(logstdout=True, loglevel=log.WARNING)
+
+    @staticmethod
+    def read_sourceconfiguration():
+        """
+        This function reads sources.cfg in the main folder for configuration
+        variables for sources
+        :return a ConfigParser object of sources.cfg
+        """
+        config = ConfigParser.ConfigParser()
+        config.read('sources.cfg')  # [TODO]: should be softcoded eventually
+        return config
+
+    @staticmethod
+    def get_section(config, sourcename):
+        """
+        This function reads a config section labeled in variable sourcename and
+        tests whether the reliability variable is set else set to empty string.
+        Return the default section if the labeled config section does not exist
+        :param config: a ConfigParser object
+        :param sourcename: the name of the section to be read
+        :return a dictionary of the section in the config labeled in sourcename
+        """
+        section = dict()
+        if config.has_section(sourcename):
+            section = dict(config.items(sourcename))
+        elif config.defaults():
+            section = config.defaults()
+        if 'reliability' not in section:
+            log.msg('Reliability not set for %s' % sourcename,
+                    level=log.WARNING)
+            section['reliability'] = ''
+        return section
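get_section guarantees a reliability key and falls back to ConfigParser's DEFAULT section when a source has no section of its own, which is exactly what tests/test_configurator.py exercises. A behaviour sketch (the section name 'NIST' is just an example):

```python
import ConfigParser
from utils.configurator import Configurator

config = ConfigParser.ConfigParser()
print Configurator.get_section(config, 'NIST')
# {'reliability': ''}  plus a 'Reliability not set for NIST' warning

config.set('DEFAULT', 'reliability', 'Low')
print Configurator.get_section(config, 'NIST')
# {'reliability': 'Low'}
```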
--- a/sourceloader.py
+++ b/utils/sourceloader.py
@@ -3,26 +3,31 @@ import os
 import re
 
 from FourmiCrawler.sources.source import Source
+from utils.configurator import Configurator
 
 
 class SourceLoader:
     sources = []
 
-    def __init__(self, rel_dir="FourmiCrawler/sources"):
+    def __init__(self, rel_dir="../FourmiCrawler/sources"):
         """
         The initiation of a SourceLoader, selects and indexes a directory for usable sources.
+        Also loads a configuration file for Sources and passes the arguments in
+        the named section to the source
         :param rel_dir: A relative path to a directory.
         """
         path = os.path.dirname(os.path.abspath(__file__))
         path += "/" + rel_dir
         known_parser = set()
 
+        config = Configurator.read_sourceconfiguration()
+
         for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
-            mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
+            mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
             classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
             for cls in classes:
                 if issubclass(cls, Source) and cls not in known_parser:
-                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
+                    sourcecfg = Configurator.get_section(config, cls.__name__)
+                    self.sources.append(cls(sourcecfg))
                     known_parser.add(cls)
 
     def include(self, source_names):
@@ -55,4 +60,4 @@ class SourceLoader:
         string += "Source: " + src.__class__.__name__
         string += " - "
         string += "URI: " + src.website + "\n"
         return string