Merge branch 'develop' into feature/GUI

commit 520216b528
@@ -1,3 +1,7 @@
+### v0.6.0
+- FIX: Using absolute path for configuration files
+- DEV: General Code cleanup in documentation
+
 ### v0.5.3
 - FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
 - FIX: Logging is now "actually" disabled if not using the verbose option.
@@ -21,7 +21,4 @@ FEED_FORMAT = 'jsonlines'
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent
 
-# [todo] - Check for repercussions on spoofing the user agent
-
-# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
+USER_AGENT = 'Fourmi'
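
The replacement above stops spoofing a desktop Chrome user agent and identifies the crawler honestly, which also settles the removed [todo] about spoofing repercussions. A quick sanity check of the setting (not part of the commit; assumes it is run inside the Scrapy project so the project settings resolve):

```python
from scrapy.utils.project import get_project_settings

# Scrapy attaches the USER_AGENT setting verbatim to every request,
# so the crawler now introduces itself as 'Fourmi'.
print(get_project_settings().get('USER_AGENT'))  # 'Fourmi'
```
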
@@ -9,24 +9,28 @@ from FourmiCrawler.items import Result
 
 
 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
-# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not
 
 
 class ChemSpider(Source):
-    """ChemSpider scraper for synonyms and properties
+    """
+    ChemSpider scraper for synonyms and properties
     This parser will manage searching for chemicals through the
     ChemsSpider API, and parsing the resulting ChemSpider page.
     The token required for the API should be in a configuration file
     somewhere.
     """
 
-    website = 'http://www.chemspider.com/*'
+    website = 'http://www\\.chemspider\\.com/.*'
 
     search = 'Search.asmx/SimpleSearch?query=%s&token='
     structure = 'Chemical-Structure.%s.html'
     extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
 
     def __init__(self, config=None):
+        """
+        Initialization of ChemSpider scraper
+        :param config: a dictionary of settings for this scraper, must contain
+        'reliability' key
+        """
         Source.__init__(self, config)
         self.ignore_list = []
         if 'token' not in self.cfg or self.cfg['token'] == '':
@@ -37,6 +41,12 @@ class ChemSpider(Source):
             self.extendedinfo += self.cfg['token']
 
     def parse(self, response):
+        """
+        This function is called when a Response matching the variable
+        'website' is available for parsing the Response object.
+        :param response: the Scrapy Response object to be parsed
+        :return: a list of Result items and Request objects
+        """
         sel = Selector(response)
         requests = []
         requests_synonyms = self.parse_synonyms(sel)
@@ -47,10 +57,26 @@ class ChemSpider(Source):
         return requests
 
     def parse_properties(self, sel):
-        """scrape Experimental Data and Predicted ACD/Labs tabs"""
+        """
+        This function scrapes the Experimental Data and Predicted ACD/Labs tabs
+        :param sel: a Selector object of the whole page
+        :return: a list of Result items
+        """
+        properties = []
+
+        properties.extend(self.parse_acdlabstab(sel))
+        properties.extend(self.parse_experimentaldatatab(sel))
+
+        return properties
+
+    def parse_acdlabstab(self, sel):
+        """
+        This function scrapes the 'Predicted ACD/Labs tab' under Properties
+        :param sel: a Selector object of the whole page
+        :return: a list of Request objects
+        """
         properties = []
 
-        # Predicted - ACD/Labs tab
         td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
             'normalize-space(string())')
         prop_names = td_list[::2]
@@ -62,16 +88,15 @@ class ChemSpider(Source):
             prop_conditions = ''
 
             # Test for properties without values, with one hardcoded exception
-            if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
+            if (not re.match(r'^\d', prop_value) or
+                    (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
                 continue
 
-            # Match for condition in parentheses
             m = re.match(r'(.*) \((.*)\)', prop_name)
             if m:
                 prop_name = m.group(1)
                 prop_conditions = m.group(2)
 
-            # Match for condition in value seperated by an 'at'
             m = re.match(r'(.*) at (.*)', prop_value)
             if m:
                 prop_value = m.group(1)
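
The two regular expressions kept above split an experimental condition off either the property name or the property value; the deleted comments described exactly that. A standalone sketch with made-up scraped cells shows both paths:

```python
import re

# Hypothetical cells from the ACD/Labs table: a condition either trails
# the name in parentheses or trails the value after 'at'.
prop_name, prop_value, prop_conditions = 'Boiling Point (760 mmHg)', '100 at 760 mmHg', ''

m = re.match(r'(.*) \((.*)\)', prop_name)  # condition in parentheses
if m:
    prop_name, prop_conditions = m.group(1), m.group(2)

m = re.match(r'(.*) at (.*)', prop_value)  # condition separated by 'at'
if m:
    prop_value = m.group(1)

print('%s | %s | %s' % (prop_name, prop_value, prop_conditions))  # Boiling Point | 100 | 760 mmHg
```
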
@@ -84,11 +109,18 @@ class ChemSpider(Source):
                 conditions=prop_conditions
             )
             properties.append(new_prop)
-            log.msg('CS prop: |%s| |%s| |%s|' %
-                    (new_prop['attribute'], new_prop['value'], new_prop['source']),
-                    level=log.DEBUG)
 
-        # Experimental Data Tab, Physico-chemical properties in particular
+        return properties
+
+    def parse_experimentaldatatab(self, sel):
+        """
+        This function scrapes Experimental Data tab, Physico-chemical
+        properties in particular.
+        :param sel: a Selector object of the whole page
+        :return: a list of Result items
+        """
+        properties = []
+
         scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                  'Properties"]//li/table/tr/td')
         if not scraped_list:
@@ -106,14 +138,15 @@ class ChemSpider(Source):
                 source=line.xpath('strong/text()').extract()[0].rstrip(),
             )
             properties.append(new_prop)
-            log.msg('CS prop: |%s| |%s| |%s|' %
-                    (new_prop['attribute'], new_prop['value'],
-                     new_prop['source']), level=log.DEBUG)
 
         return properties
 
     def parse_synonyms(self, sel):
-        """Scrape list of Names and Identifiers"""
+        """
+        This function scrapes the list of Names and Identifiers
+        :param sel: a Selector object of the whole page
+        :return: a list of Requests
+        """
         requests = []
         synonyms = []
 
@@ -145,7 +178,13 @@ class ChemSpider(Source):
         return requests
 
     def new_synonym(self, sel, name, category):
-        """Scrape for a single synonym at a given HTML tag"""
+        """
+        This function scrapes for a single synonym at a given HTML tag
+        :param sel: a Selector object of the given HTML tag
+        :param name: the name of the synonym in the tag
+        :param category: the name of the category the synonym is labeled as
+        :return: a dictionary containing data on the synonym
+        """
         self.ignore_list.append(name)
         language = sel.xpath('span[@class="synonym_language"]/text()')
         if language:
@@ -181,7 +220,12 @@ class ChemSpider(Source):
         return synonym
 
     def parse_extendedinfo(self, response):
-        """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
+        """
+        This function scrapes data from the ChemSpider GetExtendedCompoundInfo
+        API, if a token is present in the configuration settings
+        :param response: a Response object to be parsed
+        :return: a list of Result items
+        """
         sel = Selector(response)
         properties = []
         names = sel.xpath('*').xpath('name()').extract()
@@ -197,8 +241,16 @@ class ChemSpider(Source):
         return properties
 
     def newresult(self, attribute, value, conditions='', source='ChemSpider'):
-        return Result(
-            {
+        """
+        This function abstracts from the Result item and provides default
+        values.
+        :param attribute: the name of the attribute
+        :param value: the value of the attribute
+        :param conditions: optional conditions regarding the value
+        :param source: the name of the source if it is not ChemSpider
+        :return: A Result item
+        """
+        return Result({
             'attribute': attribute,
             'value': value,
             'source': source,
@@ -207,7 +259,13 @@ class ChemSpider(Source):
         })
 
     def parse_searchrequest(self, response):
-        """Parse the initial response of the ChemSpider Search API """
+        """
+        This function parses the initial response of the ChemSpider Search API
+        Requires a valid token to function.
+        :param response: the Response object to be parsed
+        :return: A Request for the information page and a Request for the
+        extendedinfo API call
+        """
         sel = Selector(response)
         log.msg('chemspider parse_searchrequest', level=log.DEBUG)
         sel.register_namespace('cs', 'http://www.chemspider.com/')
@@ -219,8 +277,8 @@ class ChemSpider(Source):
             log.msg('ChemSpider found multiple substances, taking first '
                     'element', level=log.DEBUG)
         csid = csids[0]
-        structure_url = self.website[:-1] + self.structure % csid
-        extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
+        structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
+        extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
         log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
         return [Request(url=structure_url,
                         callback=self.parse),
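
Throughout this merge the `website` attribute changes from a glob-like string ending in `*` to a real regular expression ending in `.*` with escaped dots, so every spot that builds a concrete URL from it switches from `[:-1]` to `[:-2].replace("\\", "")`: drop the trailing `.*` and strip the escape backslashes. A minimal sketch of the conversion (the CSID is an arbitrary example value):

```python
# The class attribute is now usable as a regex with re.match ...
website = 'http://www\\.chemspider\\.com/.*'

# ... and the concrete base URL is recovered by cutting the trailing
# '.*' (two characters) and removing the backslash escapes.
base_url = website[:-2].replace("\\", "")
assert base_url == 'http://www.chemspider.com/'

structure = 'Chemical-Structure.%s.html'
print(base_url + structure % 2157)  # http://www.chemspider.com/Chemical-Structure.2157.html
```
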
@@ -228,8 +286,13 @@ class ChemSpider(Source):
                     callback=self.parse_extendedinfo)]
 
     def new_compound_request(self, compound):
+        """
+        This function is called when a new synonym is returned to the spider
+        to generate new requests
+        :param compound: the name of the compound to search for
+        """
         if compound in self.ignore_list or self.cfg['token'] == '':
             return None
-        searchurl = self.website[:-1] + self.search % compound
+        searchurl = self.website[:-2].replace("\\", "") + self.search % compound
         log.msg('chemspider compound', level=log.DEBUG)
         return Request(url=searchurl, callback=self.parse_searchrequest)
@@ -13,20 +13,31 @@ from FourmiCrawler.items import Result
 # Result item, but should be included eventually.
 
 class NIST(Source):
-    """NIST Scraper plugin
+    """
+    NIST Scraper plugin
     This plugin manages searching for a chemical on the NIST website
     and parsing the resulting page if the chemical exists on NIST.
     """
-    website = "http://webbook.nist.gov/*"
+    website = "http://webbook\\.nist\\.gov/.*"
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
     def __init__(self, config=None):
+        """
+        Initialization of NIST scraper
+        :param config: configuration variables for this scraper, must contain
+        'reliability' key.
+        """
         Source.__init__(self, config)
         self.ignore_list = set()
 
     def parse(self, response):
+        """
+        This function is called when a Response matching the variable
+        'website' is available for parsing the Response object.
+        :param response: The Scrapy Response object to be parsed
+        :return: a list of Result items and Request objects
+        """
         sel = Selector(response)
 
         title = sel.xpath('head/title/text()').extract()[0]
@@ -51,6 +62,21 @@ class NIST(Source):
         log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                 level=log.DEBUG)
 
+        requests.extend(self.parse_tables(sel, symbol_table))
+
+        return requests
+
+    def parse_tables(self, sel, symbol_table):
+        """
+        This function identifies and distributes parsing of tables to other
+        functions below.
+        :param sel: A Selector object of the whole page
+        :param symbol_table: a dictionary containing translations of raw HTML
+        tags to human readable names
+        :return: a list of Result items and Requests
+        """
+        requests = []
+
         for table in sel.xpath('//table[@class="data"]'):
             summary = table.xpath('@summary').extract()[0]
             if summary == 'One dimensional data':
@@ -81,8 +107,12 @@ class NIST(Source):
         return requests
 
     def parse_generic_info(self, sel):
-        """Parses: synonyms, chemical formula, molecular weight, InChI,
-        InChiKey, CAS number
+        """
+        This function parses: synonyms, chemical formula, molecular weight,
+        InChI, InChiKey, CAS number
+        :param sel: A Selector object of the entire page in the original
+        response
+        :return: a list of Result items
         """
         ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
 
@@ -121,15 +151,20 @@ class NIST(Source):
         return requests
 
     def parse_aggregate_data(self, table, symbol_table):
-        """Parses the table(s) which contain possible links to individual
-        data points
+        """
+        This function parses the table(s) which contain possible links to
+        individual data points
+        :param table: a Selector object of the table to be parsed
+        :param symbol_table: a dictionary containing translations of raw HTML
+        tags to human readable names
+        :return: a list of Result items and Request objects
         """
         results = []
         for tr in table.xpath('tr[td]'):
             extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                       '/a/@href').extract()
             if extra_data_url:
-                request = Request(url=self.website[:-1] + extra_data_url[0],
+                request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
                                   callback=self.parse_individual_datapoints)
                 results.append(request)
                 continue
@@ -155,14 +190,16 @@ class NIST(Source):
         return results
 
     def parse_transition_data(self, table, summary):
-        """Parses the table containing properties regarding phase changes"""
+        """
+        This function parses the table containing properties regarding phase
+        changes
+        :param table: a Selector object of the table to be parsed
+        :param summary: the name of the property
+        :return: a list of Result items
+        """
         results = []
 
-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
+        unit = self.get_unit(table)
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
@@ -176,18 +213,18 @@ class NIST(Source):
         return results
 
     def parse_generic_data(self, table, summary):
-        """Parses the common tables of 4 and 5 rows. Assumes they are of the
+        """
+        Parses the common tables of 4 and 5 rows. Assumes they are of the
         form:
         Symbol (unit)|Temperature (K)|Method|Reference|Comment
         Symbol (unit)|Temperature (K)|Reference|Comment
+        :param table: a Selector object of the table to be parsed
+        :param summary: the name of the property
+        :return: a list of Result items
         """
         results = []
 
-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
+        unit = self.get_unit(table)
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
@@ -200,7 +237,13 @@ class NIST(Source):
         return results
 
     def parse_antoine_data(self, table, summary):
-        """Parse table containing parameters for the Antione equation"""
+        """
+        This function parses the table containing parameters for the Antione
+        equation
+        :param table: a Selector object of the table to be parsed
+        :param summary: the name of the property
+        :return: a list of Result items
+        """
         results = []
 
         for tr in table.xpath('tr[td]'):
@@ -215,7 +258,12 @@ class NIST(Source):
         return results
 
     def parse_individual_datapoints(self, response):
-        """Parses the page linked from aggregate data"""
+        """
+        This function parses the 'individual data points' page linked from
+        the aggregate data table(s)
+        :param response: the Scrapy Response object to be parsed
+        :return: a list of Result items
+        """
         sel = Selector(response)
         table = sel.xpath('//table[@class="data"]')[0]
 
@@ -228,11 +276,7 @@ class NIST(Source):
             name = m.group(1)
             condition = m.group(2)
 
-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
+        unit = self.get_unit(table)
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
@@ -250,7 +294,25 @@ class NIST(Source):
 
         return results
 
+    @staticmethod
+    def get_unit(table):
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
+
+        return unit
+
     def newresult(self, attribute, value, conditions=''):
+        """
+        This function abstracts from the Result item and provides default
+        values
+        :param attribute: the name of the attribute
+        :param value: the value of the attribute
+        :param conditions: optional conditions regarding the value
+        :return: A Result item
+        """
         return Result(
             {
                 'attribute': attribute,
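
The unit-extraction snippet that was duplicated in parse_transition_data, parse_generic_data, and parse_individual_datapoints now lives once in the `get_unit` helper above. Its core is a plain regex over the first header cell; a standalone sketch with made-up header strings (no Selector required, helper name hypothetical):

```python
import re

def unit_from_header(tr_unit):
    # NIST data-table headers look like 'Tboil (K)': the unit sits in
    # parentheses, and '!' marks a header without one. get_unit does
    # the same after flattening the header cell to text.
    m = re.search(r'\((.*)\)', tr_unit)
    return m.group(1) if m else '!'

assert unit_from_header('Tboil (K)') == 'K'
assert unit_from_header('Quantity') == '!'
```
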
@@ -261,7 +323,12 @@ class NIST(Source):
             })
 
     def new_compound_request(self, compound):
+        """
+        This function is called when a new synonym is returned to the spider
+        to generate new requests
+        :param compound: the name of the compound to search for
+        """
         if compound not in self.ignore_list:
             self.ignore_list.update(compound)
-            return Request(url=self.website[:-1] + self.search % compound,
+            return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
                            callback=self.parse)
@@ -1,9 +1,11 @@
+import re
+
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector
 
+from source import Source
 from FourmiCrawler.items import Result
-import re
 
 
 class PubChem(Source):
@@ -13,10 +15,10 @@ class PubChem(Source):
     including sources of the values of properties.
     """
 
-    #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
-    website = 'https://*.ncbi.nlm.nih.gov/*'
-    website_www = 'https://www.ncbi.nlm.nih.gov/*'
-    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
+    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
+    website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
+    website_www = 'http://www.ncbi.nlm.nih.gov/*'
+    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
     search = 'pccompound?term=%s'
     data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
 
@@ -49,17 +51,19 @@ class PubChem(Source):
                 self._spider.get_synonym_requests(synonym)
         log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
 
-        n = re.search(r'cid=(\d+)',response.url)
+        n = re.search(r'cid=(\d+)', response.url)
         if n:
             cid = n.group(1)
-            log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach
+            log.msg('cid: %s' % cid, level=log.DEBUG)  # getting the right id of the compound with which it can reach
             # the seperate html page which contains the properties and their values
 
-            #using this cid to get the right url and scrape it
-            requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
+            # using this cid to get the right url and scrape it
+            requests.append(
+                Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
         return requests
 
-    def parse_data(self, response):
+    @staticmethod
+    def parse_data(response):
         """
         Parse data found in 'Chemical and Physical properties' part of a substance page.
         :param response: The response with the page to parse
@@ -80,7 +84,7 @@ class PubChem(Source):
                 'attribute': prop_name,
                 'value': prop_value,
                 'source': prop_source,
-                'reliability': 'Unknown',
+                'reliability': self.cfg['reliability'],
                 'conditions': ''
             })
             log.msg('PubChem prop: |%s| |%s| |%s|' %
@@ -96,7 +100,7 @@ class PubChem(Source):
                 'attribute': prop_name,
                 'value': prop_value,
                 'source': prop_source,
-                'reliability': 'Unknown',
+                'reliability': self.cfg['reliability'],
                 'conditions': ''
             })
             log.msg('PubChem prop: |%s| |%s| |%s|' %
@@ -106,6 +110,41 @@ class PubChem(Source):
 
         return requests
 
+    def parse_searchrequest(self, response):
+        """
+        This function parses the response to the new_compound_request Request
+        :param response: the Response object to be parsed
+        :return: A Request for the compound page or what self.parse returns in
+        case the search request forwarded to the compound page
+        """
+
+        # check if pubchem forwarded straight to compound page
+        m = re.match(self.website_pubchem, response.url)
+        if m:
+            log.msg('PubChem search forwarded to compound page',
+                    level=log.DEBUG)
+            return self.parse(response)
+
+        sel = Selector(response)
+
+        results = sel.xpath('//div[@class="rsltcont"]')
+        if results:
+            url = results[0].xpath('div/p/a[1]/@href')
+        else:
+            log.msg('PubChem search found nothing or xpath failed',
+                    level=log.DEBUG)
+            return None
+
+        if url:
+            url = 'http:' + ''.join(url[0].extract())
+            log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
+        else:
+            log.msg('PubChem search found results, but no url in first result',
+                    level=log.DEBUG)
+            return None
+
+        return Request(url=url, callback=self.parse)
+
     def new_compound_request(self, compound):
-        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
+        return Request(url=self.website_www[:-1] + self.search % compound,
+                       callback=self.parse_searchrequest)
 
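
The new parse_searchrequest leans on website_pubchem now being a regular expression: a search that PubChem forwards straight to a compound page can be recognized with a plain re.match and handed to parse. A toy check with made-up URLs:

```python
import re

website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'

# A forward to a compound page matches and goes to self.parse() ...
print(bool(re.match(website_pubchem, 'http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=702')))  # True
# ... while a regular result listing on the www host does not.
print(bool(re.match(website_pubchem, 'http://www.ncbi.nlm.nih.gov/pccompound?term=ethanol')))          # False
```
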
@@ -15,7 +15,7 @@ class WikipediaParser(Source):
     It also returns requests with other external sources which contain information on parsed subject.
     """
 
-    website = "http://en.wikipedia.org/wiki/*"
+    website = "http://en\\.wikipedia\\.org/wiki/.*"
     __spider = None
     searched_compounds = []
 
@@ -123,7 +123,7 @@ class WikipediaParser(Source):
         return items
 
     def new_compound_request(self, compound):
-        return Request(url=self.website[:-1] + compound, callback=self.parse)
+        return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
 
     @staticmethod
     def clean_items(items):
@@ -3,7 +3,7 @@ from scrapy import log
 
 
 class Source:
-    website = "http://something/*"  # Regex of URI's the source is able to parse
+    website = "http://something/.*"  # Regex of URI's the source is able to parse
     _spider = None
 
     def __init__(self, config=None):
@@ -30,7 +30,7 @@ class Source:
         :param compound: A compound name.
         :return: A new Scrapy Request
         """
-        # return Request(url=self.website[:-1] + compound, callback=self.parse)
+        # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
         pass
 
     def set_spider(self, spider):
@@ -34,8 +34,9 @@ class FourmiSpider(Spider):
         """
         for source in self._sources:
             if re.match(source.website, response.url):
-                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                 return source.parse(response)
+        log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
         return None
 
     def get_synonym_requests(self, compound, force=False):
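
This dispatch loop is the reason the `website` attributes across all sources were rewritten as real regular expressions in this merge: every response URL is tested with re.match against each registered source, and a miss on all of them is now logged at INFO instead of silently returning None. A self-contained sketch of the matching step (patterns copied from the diff, the URL is an example):

```python
import re

patterns = {
    'NIST': 'http://webbook\\.nist\\.gov/.*',
    'ChemSpider': 'http://www\\.chemspider\\.com/.*',
}

url = 'http://webbook.nist.gov/cgi/cbook.cgi?Name=water&Units=SI&cTP=on'
matched = [name for name, pattern in patterns.items() if re.match(pattern, url)]
print(matched if matched else 'No compatible source')  # ['NIST']
```
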
@@ -48,7 +48,6 @@ __Main goals:__
 - Build an graphical user interface(GUI) as alternative for the command line
 interface(CLI). (Assignee: Harmen)
 - Compiling the source into an windows executable. (Assignee: Bas)
-- Create an module to gather data from PubChem. (Assignee: Nout)
 
 __Side goals:__
 
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 """
-Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).
+Fourmi, a web scraper build to search specific information for a given compound (and its pseudonyms).
 
 Usage:
     fourmi
@@ -18,7 +18,7 @@ Options:
     --version                Show version.
     -v                       Verbose logging output. (Multiple occurrences increase logging level)
     --log=<file>             Save log to an file.
-    -o <file> --output=<file>  Output file [default: results.*format*]
+    -o <file> --output=<file>  Output file [default: <compound>.*format*]
     -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: csv]
     --include=<regex>        Include only sources that match these regular expressions split by a comma.
     --exclude=<regex>        Exclude the sources that match these regular expressions split by a comma.
|
|||||||
"""
|
"""
|
||||||
conf = Configurator()
|
conf = Configurator()
|
||||||
conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
|
conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
|
||||||
conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
|
conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
|
||||||
setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
|
setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
|
||||||
source_loader, docopt_arguments["--attributes"].split(','))
|
source_loader, docopt_arguments["--attributes"].split(','))
|
||||||
if conf.scrapy_settings.getbool("LOG_ENABLED"):
|
if conf.scrapy_settings.getbool("LOG_ENABLED"):
|
||||||
|
sources.cfg.sample (new file, 19 lines)
@@ -0,0 +1,19 @@
+[DEFAULT]
+reliability = Unknown
+
+#For each source listed in FourmiCrawler/sources there should be a section
+#named exactly as the filename in here. If not present, the DEFAULT value is
+#used for reliability of that source.
+
+[ChemSpider]
+reliability = High
+#token=Paste ChemSpider API token here and remove the hashtag
+
+[NIST]
+reliability = High
+
+[WikipediaParser]
+reliability = Medium
+
+[PubChem]
+reliability = High
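
The sample configuration relies on ConfigParser's [DEFAULT] fallback: a source section that omits `reliability` inherits Unknown. A minimal check (Python 2, matching the project's `import ConfigParser`; assumes the sample was copied to sources.cfg):

```python
import ConfigParser

config = ConfigParser.ConfigParser()
config.read('sources.cfg')

print(config.get('ChemSpider', 'reliability'))  # High (explicit value)
# A section that existed but had no 'reliability' key of its own would
# fall back to the [DEFAULT] value, i.e. 'Unknown'.
```
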
@@ -10,16 +10,16 @@ class TestConfigurator(unittest.TestCase):
         self.conf = Configurator()
 
     def test_set_output(self):
-        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.conf.set_output(filename="test.txt", fileformat="csv", compound="test")
         self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
 
-        self.conf.set_output("results.*format*", "jsonlines")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.conf.set_output("<compound>.*format*", "jsonlines", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")
 
-        self.conf.set_output("results.*format*", "csv")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.conf.set_output("<compound>.*format*", "csv", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
 
     def test_start_log(self):
@@ -1,4 +1,5 @@
 import ConfigParser
+import os
 
 from scrapy.utils.project import get_project_settings
 
@@ -12,7 +13,7 @@ class Configurator:
     def __init__(self):
         self.scrapy_settings = get_project_settings()
 
-    def set_output(self, filename, fileformat):
+    def set_output(self, filename, fileformat, compound):
         """
         This function manipulates the Scrapy output file settings that normally would be set in the settings file.
         In the Fourmi project these are command line arguments.
@@ -20,12 +21,12 @@ class Configurator:
         :param fileformat: The format in which the output will be.
         """
 
-        if filename != 'results.*format*':
+        if filename != '<compound>.*format*':
             self.scrapy_settings.overrides["FEED_URI"] = filename
         elif fileformat == "jsonlines":
-            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+            self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
         elif fileformat is not None:
-            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+            self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat
 
         if fileformat is not None:
             self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
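
Together with the docopt default changing from `results.*format*` to `<compound>.*format*`, output files are now named after the compound being searched. A walk-through mirroring the updated unit tests (the Configurator import path is assumed):

```python
from utils.configurator import Configurator  # import path assumed

conf = Configurator()

# Leaving the placeholder default untouched derives the name from the compound:
conf.set_output('<compound>.*format*', 'jsonlines', 'methane')
print(conf.scrapy_settings['FEED_URI'])     # methane.json
print(conf.scrapy_settings['FEED_FORMAT'])  # jsonlines

# An explicit --output filename still wins:
conf.set_output('mydata.csv', 'csv', 'methane')
print(conf.scrapy_settings['FEED_URI'])     # mydata.csv
```
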
@@ -66,8 +67,11 @@ class Configurator:
         variables for sources
         :return a ConfigParser object of sources.cfg
         """
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        config_path = current_dir + '/../sources.cfg'
+        # [TODO]: location of sources.cfg should be softcoded eventually
         config = ConfigParser.ConfigParser()
-        config.read('sources.cfg')  # [TODO]: should be softcoded eventually
+        config.read(config_path)
         return config
 
     @staticmethod
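
This hunk is the "absolute path for configuration files" fix announced in the v0.6.0 changelog entry at the top of this merge: a bare `config.read('sources.cfg')` resolves against whatever directory fourmi was started from, while anchoring on `__file__` always resolves next to the module. A toy illustration:

```python
import os

# Relative lookup: depends on the current working directory.
relative = 'sources.cfg'

# Anchored lookup: resolves next to this module no matter where the
# process was launched.
current_dir = os.path.dirname(os.path.abspath(__file__))
anchored = current_dir + '/../sources.cfg'

print(os.path.abspath(relative))
print(os.path.abspath(anchored))
```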