Merge branch 'release/v0.6.0'
This commit is contained in:
commit
50e6835116
2
.gitignore
vendored
2
.gitignore
vendored
@ -6,6 +6,8 @@
|
|||||||
|
|
||||||
#may contain authentication information
|
#may contain authentication information
|
||||||
sources.cfg
|
sources.cfg
|
||||||
|
#Another of our config files
|
||||||
|
GUI.cfg
|
||||||
|
|
||||||
#THINGS WE WOULD NEVER EVER WANT!
|
#THINGS WE WOULD NEVER EVER WANT!
|
||||||
#ignore thumbnails created by windows
|
#ignore thumbnails created by windows
|
||||||
|
@ -3,6 +3,10 @@
|
|||||||
language: python
|
language: python
|
||||||
python: 2.7
|
python: 2.7
|
||||||
|
|
||||||
|
before_install:
|
||||||
|
- "export DISPLAY=:99.0"
|
||||||
|
- "sh -e /etc/init.d/xvfb start"
|
||||||
|
|
||||||
# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
|
# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
|
||||||
install:
|
install:
|
||||||
- pip install Scrapy docopt
|
- pip install Scrapy docopt
|
||||||
@ -10,10 +14,10 @@ install:
|
|||||||
|
|
||||||
# command to run tests, e.g. python setup.py test
|
# command to run tests, e.g. python setup.py test
|
||||||
script:
|
script:
|
||||||
- nosetests --with-coverage --cover-package=FourmiCrawler,utils tests
|
- nosetests --with-coverage --cover-package=FourmiCrawler,utils,GUI tests
|
||||||
|
|
||||||
notifications:
|
notifications:
|
||||||
slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
|
slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
|
||||||
|
|
||||||
after_success:
|
after_success:
|
||||||
coveralls --verbose
|
coveralls --verbose
|
||||||
|
@ -1,3 +1,11 @@
|
|||||||
|
### v0.6.0
|
||||||
|
- Feature: Added a Graphical User interface
|
||||||
|
- Feature: Automatic config file createion from config samples
|
||||||
|
- FIX: The default name of the output files will now consist of the compound name and the file format when using the CLI
|
||||||
|
- FIX: A lot of bugfixes of the PubChem plugin, as is wasn't working as it should
|
||||||
|
- FIX: Using absolute path for configuration files
|
||||||
|
- DEV: General Code cleanup in documentation
|
||||||
|
|
||||||
### v0.5.3
|
### v0.5.3
|
||||||
- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
|
- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
|
||||||
- FIX: Logging is now "actually" disabled if not using the verbose option.
|
- FIX: Logging is now "actually" disabled if not using the verbose option.
|
||||||
|
@ -21,7 +21,4 @@ FEED_FORMAT = 'jsonlines'
|
|||||||
# Crawl responsibly by identifying yourself (and your website) on the
|
# Crawl responsibly by identifying yourself (and your website) on the
|
||||||
# user-agent
|
# user-agent
|
||||||
|
|
||||||
# [todo] - Check for repercussions on spoofing the user agent
|
USER_AGENT = 'Fourmi'
|
||||||
|
|
||||||
# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
|
|
||||||
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
|
|
||||||
|
@ -9,24 +9,28 @@ from FourmiCrawler.items import Result
|
|||||||
|
|
||||||
|
|
||||||
# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
|
# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
|
||||||
# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not
|
|
||||||
|
|
||||||
class ChemSpider(Source):
|
class ChemSpider(Source):
|
||||||
"""ChemSpider scraper for synonyms and properties
|
"""
|
||||||
|
ChemSpider scraper for synonyms and properties
|
||||||
This parser will manage searching for chemicals through the
|
This parser will manage searching for chemicals through the
|
||||||
ChemsSpider API, and parsing the resulting ChemSpider page.
|
ChemsSpider API, and parsing the resulting ChemSpider page.
|
||||||
The token required for the API should be in a configuration file
|
The token required for the API should be in a configuration file
|
||||||
somewhere.
|
somewhere.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
website = 'http://www.chemspider.com/*'
|
website = 'http://www\\.chemspider\\.com/.*'
|
||||||
|
|
||||||
search = 'Search.asmx/SimpleSearch?query=%s&token='
|
search = 'Search.asmx/SimpleSearch?query=%s&token='
|
||||||
structure = 'Chemical-Structure.%s.html'
|
structure = 'Chemical-Structure.%s.html'
|
||||||
extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
|
extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
|
||||||
|
|
||||||
def __init__(self, config=None):
|
def __init__(self, config=None):
|
||||||
|
"""
|
||||||
|
Initialization of ChemSpider scraper
|
||||||
|
:param config: a dictionary of settings for this scraper, must contain
|
||||||
|
'reliability' key
|
||||||
|
"""
|
||||||
Source.__init__(self, config)
|
Source.__init__(self, config)
|
||||||
self.ignore_list = []
|
self.ignore_list = []
|
||||||
if 'token' not in self.cfg or self.cfg['token'] == '':
|
if 'token' not in self.cfg or self.cfg['token'] == '':
|
||||||
@ -37,6 +41,12 @@ class ChemSpider(Source):
|
|||||||
self.extendedinfo += self.cfg['token']
|
self.extendedinfo += self.cfg['token']
|
||||||
|
|
||||||
def parse(self, response):
|
def parse(self, response):
|
||||||
|
"""
|
||||||
|
This function is called when a Response matching the variable
|
||||||
|
'website' is available for parsing the Response object.
|
||||||
|
:param response: the Scrapy Response object to be parsed
|
||||||
|
:return: a list of Result items and Request objects
|
||||||
|
"""
|
||||||
sel = Selector(response)
|
sel = Selector(response)
|
||||||
requests = []
|
requests = []
|
||||||
requests_synonyms = self.parse_synonyms(sel)
|
requests_synonyms = self.parse_synonyms(sel)
|
||||||
@ -47,10 +57,26 @@ class ChemSpider(Source):
|
|||||||
return requests
|
return requests
|
||||||
|
|
||||||
def parse_properties(self, sel):
|
def parse_properties(self, sel):
|
||||||
"""scrape Experimental Data and Predicted ACD/Labs tabs"""
|
"""
|
||||||
|
This function scrapes the Experimental Data and Predicted ACD/Labs tabs
|
||||||
|
:param sel: a Selector object of the whole page
|
||||||
|
:return: a list of Result items
|
||||||
|
"""
|
||||||
|
properties = []
|
||||||
|
|
||||||
|
properties.extend(self.parse_acdlabstab(sel))
|
||||||
|
properties.extend(self.parse_experimentaldatatab(sel))
|
||||||
|
|
||||||
|
return properties
|
||||||
|
|
||||||
|
def parse_acdlabstab(self, sel):
|
||||||
|
"""
|
||||||
|
This function scrapes the 'Predicted ACD/Labs tab' under Properties
|
||||||
|
:param sel: a Selector object of the whole page
|
||||||
|
:return: a list of Request objects
|
||||||
|
"""
|
||||||
properties = []
|
properties = []
|
||||||
|
|
||||||
# Predicted - ACD/Labs tab
|
|
||||||
td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
|
td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
|
||||||
'normalize-space(string())')
|
'normalize-space(string())')
|
||||||
prop_names = td_list[::2]
|
prop_names = td_list[::2]
|
||||||
@ -62,16 +88,15 @@ class ChemSpider(Source):
|
|||||||
prop_conditions = ''
|
prop_conditions = ''
|
||||||
|
|
||||||
# Test for properties without values, with one hardcoded exception
|
# Test for properties without values, with one hardcoded exception
|
||||||
if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
|
if (not re.match(r'^\d', prop_value) or
|
||||||
|
(prop_name == 'Polarizability' and prop_value == '10-24cm3')):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Match for condition in parentheses
|
|
||||||
m = re.match(r'(.*) \((.*)\)', prop_name)
|
m = re.match(r'(.*) \((.*)\)', prop_name)
|
||||||
if m:
|
if m:
|
||||||
prop_name = m.group(1)
|
prop_name = m.group(1)
|
||||||
prop_conditions = m.group(2)
|
prop_conditions = m.group(2)
|
||||||
|
|
||||||
# Match for condition in value seperated by an 'at'
|
|
||||||
m = re.match(r'(.*) at (.*)', prop_value)
|
m = re.match(r'(.*) at (.*)', prop_value)
|
||||||
if m:
|
if m:
|
||||||
prop_value = m.group(1)
|
prop_value = m.group(1)
|
||||||
@ -84,11 +109,18 @@ class ChemSpider(Source):
|
|||||||
conditions=prop_conditions
|
conditions=prop_conditions
|
||||||
)
|
)
|
||||||
properties.append(new_prop)
|
properties.append(new_prop)
|
||||||
log.msg('CS prop: |%s| |%s| |%s|' %
|
|
||||||
(new_prop['attribute'], new_prop['value'], new_prop['source']),
|
|
||||||
level=log.DEBUG)
|
|
||||||
|
|
||||||
# Experimental Data Tab, Physico-chemical properties in particular
|
return properties
|
||||||
|
|
||||||
|
def parse_experimentaldatatab(self, sel):
|
||||||
|
"""
|
||||||
|
This function scrapes Experimental Data tab, Physico-chemical
|
||||||
|
properties in particular.
|
||||||
|
:param sel: a Selector object of the whole page
|
||||||
|
:return: a list of Result items
|
||||||
|
"""
|
||||||
|
properties = []
|
||||||
|
|
||||||
scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
|
scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
|
||||||
'Properties"]//li/table/tr/td')
|
'Properties"]//li/table/tr/td')
|
||||||
if not scraped_list:
|
if not scraped_list:
|
||||||
@ -105,15 +137,16 @@ class ChemSpider(Source):
|
|||||||
value=line.xpath('text()').extract()[0].rstrip(),
|
value=line.xpath('text()').extract()[0].rstrip(),
|
||||||
source=line.xpath('strong/text()').extract()[0].rstrip(),
|
source=line.xpath('strong/text()').extract()[0].rstrip(),
|
||||||
)
|
)
|
||||||
properties.append(new_prop)
|
properties.append(new_prop)
|
||||||
log.msg('CS prop: |%s| |%s| |%s|' %
|
|
||||||
(new_prop['attribute'], new_prop['value'],
|
|
||||||
new_prop['source']), level=log.DEBUG)
|
|
||||||
|
|
||||||
return properties
|
return properties
|
||||||
|
|
||||||
def parse_synonyms(self, sel):
|
def parse_synonyms(self, sel):
|
||||||
"""Scrape list of Names and Identifiers"""
|
"""
|
||||||
|
This function scrapes the list of Names and Identifiers
|
||||||
|
:param sel: a Selector object of the whole page
|
||||||
|
:return: a list of Requests
|
||||||
|
"""
|
||||||
requests = []
|
requests = []
|
||||||
synonyms = []
|
synonyms = []
|
||||||
|
|
||||||
@ -145,7 +178,13 @@ class ChemSpider(Source):
|
|||||||
return requests
|
return requests
|
||||||
|
|
||||||
def new_synonym(self, sel, name, category):
|
def new_synonym(self, sel, name, category):
|
||||||
"""Scrape for a single synonym at a given HTML tag"""
|
"""
|
||||||
|
This function scrapes for a single synonym at a given HTML tag
|
||||||
|
:param sel: a Selector object of the given HTML tag
|
||||||
|
:param name: the name of the synonym in the tag
|
||||||
|
:param category: the name of the category the synonym is labeled as
|
||||||
|
:return: a dictionary containing data on the synonym
|
||||||
|
"""
|
||||||
self.ignore_list.append(name)
|
self.ignore_list.append(name)
|
||||||
language = sel.xpath('span[@class="synonym_language"]/text()')
|
language = sel.xpath('span[@class="synonym_language"]/text()')
|
||||||
if language:
|
if language:
|
||||||
@ -181,7 +220,12 @@ class ChemSpider(Source):
|
|||||||
return synonym
|
return synonym
|
||||||
|
|
||||||
def parse_extendedinfo(self, response):
|
def parse_extendedinfo(self, response):
|
||||||
"""Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
|
"""
|
||||||
|
This function scrapes data from the ChemSpider GetExtendedCompoundInfo
|
||||||
|
API, if a token is present in the configuration settings
|
||||||
|
:param response: a Response object to be parsed
|
||||||
|
:return: a list of Result items
|
||||||
|
"""
|
||||||
sel = Selector(response)
|
sel = Selector(response)
|
||||||
properties = []
|
properties = []
|
||||||
names = sel.xpath('*').xpath('name()').extract()
|
names = sel.xpath('*').xpath('name()').extract()
|
||||||
@ -197,17 +241,31 @@ class ChemSpider(Source):
|
|||||||
return properties
|
return properties
|
||||||
|
|
||||||
def newresult(self, attribute, value, conditions='', source='ChemSpider'):
|
def newresult(self, attribute, value, conditions='', source='ChemSpider'):
|
||||||
return Result(
|
"""
|
||||||
{
|
This function abstracts from the Result item and provides default
|
||||||
'attribute': attribute,
|
values.
|
||||||
'value': value,
|
:param attribute: the name of the attribute
|
||||||
'source': source,
|
:param value: the value of the attribute
|
||||||
'reliability': self.cfg['reliability'],
|
:param conditions: optional conditions regarding the value
|
||||||
'conditions': conditions
|
:param source: the name of the source if it is not ChemSpider
|
||||||
})
|
:return: A Result item
|
||||||
|
"""
|
||||||
|
return Result({
|
||||||
|
'attribute': attribute,
|
||||||
|
'value': value,
|
||||||
|
'source': source,
|
||||||
|
'reliability': self.cfg['reliability'],
|
||||||
|
'conditions': conditions
|
||||||
|
})
|
||||||
|
|
||||||
def parse_searchrequest(self, response):
|
def parse_searchrequest(self, response):
|
||||||
"""Parse the initial response of the ChemSpider Search API """
|
"""
|
||||||
|
This function parses the initial response of the ChemSpider Search API
|
||||||
|
Requires a valid token to function.
|
||||||
|
:param response: the Response object to be parsed
|
||||||
|
:return: A Request for the information page and a Request for the
|
||||||
|
extendedinfo API call
|
||||||
|
"""
|
||||||
sel = Selector(response)
|
sel = Selector(response)
|
||||||
log.msg('chemspider parse_searchrequest', level=log.DEBUG)
|
log.msg('chemspider parse_searchrequest', level=log.DEBUG)
|
||||||
sel.register_namespace('cs', 'http://www.chemspider.com/')
|
sel.register_namespace('cs', 'http://www.chemspider.com/')
|
||||||
@ -219,8 +277,8 @@ class ChemSpider(Source):
|
|||||||
log.msg('ChemSpider found multiple substances, taking first '
|
log.msg('ChemSpider found multiple substances, taking first '
|
||||||
'element', level=log.DEBUG)
|
'element', level=log.DEBUG)
|
||||||
csid = csids[0]
|
csid = csids[0]
|
||||||
structure_url = self.website[:-1] + self.structure % csid
|
structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
|
||||||
extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
|
extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
|
||||||
log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
|
log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
|
||||||
return [Request(url=structure_url,
|
return [Request(url=structure_url,
|
||||||
callback=self.parse),
|
callback=self.parse),
|
||||||
@ -228,8 +286,13 @@ class ChemSpider(Source):
|
|||||||
callback=self.parse_extendedinfo)]
|
callback=self.parse_extendedinfo)]
|
||||||
|
|
||||||
def new_compound_request(self, compound):
|
def new_compound_request(self, compound):
|
||||||
|
"""
|
||||||
|
This function is called when a new synonym is returned to the spider
|
||||||
|
to generate new requests
|
||||||
|
:param compound: the name of the compound to search for
|
||||||
|
"""
|
||||||
if compound in self.ignore_list or self.cfg['token'] == '':
|
if compound in self.ignore_list or self.cfg['token'] == '':
|
||||||
return None
|
return None
|
||||||
searchurl = self.website[:-1] + self.search % compound
|
searchurl = self.website[:-2].replace("\\", "") + self.search % compound
|
||||||
log.msg('chemspider compound', level=log.DEBUG)
|
log.msg('chemspider compound', level=log.DEBUG)
|
||||||
return Request(url=searchurl, callback=self.parse_searchrequest)
|
return Request(url=searchurl, callback=self.parse_searchrequest)
|
||||||
|
@ -13,20 +13,31 @@ from FourmiCrawler.items import Result
|
|||||||
# Result item, but should be included eventually.
|
# Result item, but should be included eventually.
|
||||||
|
|
||||||
class NIST(Source):
|
class NIST(Source):
|
||||||
"""NIST Scraper plugin
|
"""
|
||||||
|
NIST Scraper plugin
|
||||||
This plugin manages searching for a chemical on the NIST website
|
This plugin manages searching for a chemical on the NIST website
|
||||||
and parsing the resulting page if the chemical exists on NIST.
|
and parsing the resulting page if the chemical exists on NIST.
|
||||||
"""
|
"""
|
||||||
website = "http://webbook.nist.gov/*"
|
website = "http://webbook\\.nist\\.gov/.*"
|
||||||
|
|
||||||
search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
|
search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
|
||||||
|
|
||||||
def __init__(self, config=None):
|
def __init__(self, config=None):
|
||||||
|
"""
|
||||||
|
Initialization of NIST scraper
|
||||||
|
:param config: configuration variables for this scraper, must contain
|
||||||
|
'reliability' key.
|
||||||
|
"""
|
||||||
Source.__init__(self, config)
|
Source.__init__(self, config)
|
||||||
self.ignore_list = set()
|
self.ignore_list = set()
|
||||||
|
|
||||||
def parse(self, response):
|
def parse(self, response):
|
||||||
|
"""
|
||||||
|
This function is called when a Response matching the variable
|
||||||
|
'website' is available for parsing the Response object.
|
||||||
|
:param response: The Scrapy Response object to be parsed
|
||||||
|
:return: a list of Result items and Request objects
|
||||||
|
"""
|
||||||
sel = Selector(response)
|
sel = Selector(response)
|
||||||
|
|
||||||
title = sel.xpath('head/title/text()').extract()[0]
|
title = sel.xpath('head/title/text()').extract()[0]
|
||||||
@ -51,6 +62,21 @@ class NIST(Source):
|
|||||||
log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
|
log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
|
||||||
level=log.DEBUG)
|
level=log.DEBUG)
|
||||||
|
|
||||||
|
requests.extend(self.parse_tables(sel, symbol_table))
|
||||||
|
|
||||||
|
return requests
|
||||||
|
|
||||||
|
def parse_tables(self, sel, symbol_table):
|
||||||
|
"""
|
||||||
|
This function identifies and distributes parsing of tables to other
|
||||||
|
functions below.
|
||||||
|
:param sel: A Selector object of the whole page
|
||||||
|
:param symbol_table: a dictionary containing translations of raw HTML
|
||||||
|
tags to human readable names
|
||||||
|
:return: a list of Result items and Requests
|
||||||
|
"""
|
||||||
|
requests = []
|
||||||
|
|
||||||
for table in sel.xpath('//table[@class="data"]'):
|
for table in sel.xpath('//table[@class="data"]'):
|
||||||
summary = table.xpath('@summary').extract()[0]
|
summary = table.xpath('@summary').extract()[0]
|
||||||
if summary == 'One dimensional data':
|
if summary == 'One dimensional data':
|
||||||
@ -81,8 +107,12 @@ class NIST(Source):
|
|||||||
return requests
|
return requests
|
||||||
|
|
||||||
def parse_generic_info(self, sel):
|
def parse_generic_info(self, sel):
|
||||||
"""Parses: synonyms, chemical formula, molecular weight, InChI,
|
"""
|
||||||
InChiKey, CAS number
|
This function parses: synonyms, chemical formula, molecular weight,
|
||||||
|
InChI, InChiKey, CAS number
|
||||||
|
:param sel: A Selector object of the entire page in the original
|
||||||
|
response
|
||||||
|
:return: a list of Result items
|
||||||
"""
|
"""
|
||||||
ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
|
ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
|
||||||
|
|
||||||
@ -121,15 +151,20 @@ class NIST(Source):
|
|||||||
return requests
|
return requests
|
||||||
|
|
||||||
def parse_aggregate_data(self, table, symbol_table):
|
def parse_aggregate_data(self, table, symbol_table):
|
||||||
"""Parses the table(s) which contain possible links to individual
|
"""
|
||||||
data points
|
This function parses the table(s) which contain possible links to
|
||||||
|
individual data points
|
||||||
|
:param table: a Selector object of the table to be parsed
|
||||||
|
:param symbol_table: a dictionary containing translations of raw HTML
|
||||||
|
tags to human readable names
|
||||||
|
:return: a list of Result items and Request objects
|
||||||
"""
|
"""
|
||||||
results = []
|
results = []
|
||||||
for tr in table.xpath('tr[td]'):
|
for tr in table.xpath('tr[td]'):
|
||||||
extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
|
extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
|
||||||
'/a/@href').extract()
|
'/a/@href').extract()
|
||||||
if extra_data_url:
|
if extra_data_url:
|
||||||
request = Request(url=self.website[:-1] + extra_data_url[0],
|
request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
|
||||||
callback=self.parse_individual_datapoints)
|
callback=self.parse_individual_datapoints)
|
||||||
results.append(request)
|
results.append(request)
|
||||||
continue
|
continue
|
||||||
@ -155,14 +190,16 @@ class NIST(Source):
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
def parse_transition_data(self, table, summary):
|
def parse_transition_data(self, table, summary):
|
||||||
"""Parses the table containing properties regarding phase changes"""
|
"""
|
||||||
|
This function parses the table containing properties regarding phase
|
||||||
|
changes
|
||||||
|
:param table: a Selector object of the table to be parsed
|
||||||
|
:param summary: the name of the property
|
||||||
|
:return: a list of Result items
|
||||||
|
"""
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
|
unit = self.get_unit(table)
|
||||||
m = re.search(r'\((.*)\)', tr_unit)
|
|
||||||
unit = '!'
|
|
||||||
if m:
|
|
||||||
unit = m.group(1)
|
|
||||||
|
|
||||||
for tr in table.xpath('tr[td]'):
|
for tr in table.xpath('tr[td]'):
|
||||||
tds = tr.xpath('td/text()').extract()
|
tds = tr.xpath('td/text()').extract()
|
||||||
@ -176,18 +213,18 @@ class NIST(Source):
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
def parse_generic_data(self, table, summary):
|
def parse_generic_data(self, table, summary):
|
||||||
"""Parses the common tables of 4 and 5 rows. Assumes they are of the
|
"""
|
||||||
|
Parses the common tables of 4 and 5 rows. Assumes they are of the
|
||||||
form:
|
form:
|
||||||
Symbol (unit)|Temperature (K)|Method|Reference|Comment
|
Symbol (unit)|Temperature (K)|Method|Reference|Comment
|
||||||
Symbol (unit)|Temperature (K)|Reference|Comment
|
Symbol (unit)|Temperature (K)|Reference|Comment
|
||||||
|
:param table: a Selector object of the table to be parsed
|
||||||
|
:param summary: the name of the property
|
||||||
|
:return: a list of Result items
|
||||||
"""
|
"""
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
|
unit = self.get_unit(table)
|
||||||
m = re.search(r'\((.*)\)', tr_unit)
|
|
||||||
unit = '!'
|
|
||||||
if m:
|
|
||||||
unit = m.group(1)
|
|
||||||
|
|
||||||
for tr in table.xpath('tr[td]'):
|
for tr in table.xpath('tr[td]'):
|
||||||
tds = tr.xpath('td/text()').extract()
|
tds = tr.xpath('td/text()').extract()
|
||||||
@ -200,7 +237,13 @@ class NIST(Source):
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
def parse_antoine_data(self, table, summary):
|
def parse_antoine_data(self, table, summary):
|
||||||
"""Parse table containing parameters for the Antione equation"""
|
"""
|
||||||
|
This function parses the table containing parameters for the Antione
|
||||||
|
equation
|
||||||
|
:param table: a Selector object of the table to be parsed
|
||||||
|
:param summary: the name of the property
|
||||||
|
:return: a list of Result items
|
||||||
|
"""
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
for tr in table.xpath('tr[td]'):
|
for tr in table.xpath('tr[td]'):
|
||||||
@ -215,7 +258,12 @@ class NIST(Source):
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
def parse_individual_datapoints(self, response):
|
def parse_individual_datapoints(self, response):
|
||||||
"""Parses the page linked from aggregate data"""
|
"""
|
||||||
|
This function parses the 'individual data points' page linked from
|
||||||
|
the aggregate data table(s)
|
||||||
|
:param response: the Scrapy Response object to be parsed
|
||||||
|
:return: a list of Result items
|
||||||
|
"""
|
||||||
sel = Selector(response)
|
sel = Selector(response)
|
||||||
table = sel.xpath('//table[@class="data"]')[0]
|
table = sel.xpath('//table[@class="data"]')[0]
|
||||||
|
|
||||||
@ -228,11 +276,7 @@ class NIST(Source):
|
|||||||
name = m.group(1)
|
name = m.group(1)
|
||||||
condition = m.group(2)
|
condition = m.group(2)
|
||||||
|
|
||||||
tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
|
unit = self.get_unit(table)
|
||||||
m = re.search(r'\((.*)\)', tr_unit)
|
|
||||||
unit = '!'
|
|
||||||
if m:
|
|
||||||
unit = m.group(1)
|
|
||||||
|
|
||||||
for tr in table.xpath('tr[td]'):
|
for tr in table.xpath('tr[td]'):
|
||||||
tds = tr.xpath('td/text()').extract()
|
tds = tr.xpath('td/text()').extract()
|
||||||
@ -250,7 +294,25 @@ class NIST(Source):
|
|||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_unit(table):
|
||||||
|
tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
|
||||||
|
m = re.search(r'\((.*)\)', tr_unit)
|
||||||
|
unit = '!'
|
||||||
|
if m:
|
||||||
|
unit = m.group(1)
|
||||||
|
|
||||||
|
return unit
|
||||||
|
|
||||||
def newresult(self, attribute, value, conditions=''):
|
def newresult(self, attribute, value, conditions=''):
|
||||||
|
"""
|
||||||
|
This function abstracts from the Result item and provides default
|
||||||
|
values
|
||||||
|
:param attribute: the name of the attribute
|
||||||
|
:param value: the value of the attribute
|
||||||
|
:param conditions: optional conditions regarding the value
|
||||||
|
:return: A Result item
|
||||||
|
"""
|
||||||
return Result(
|
return Result(
|
||||||
{
|
{
|
||||||
'attribute': attribute,
|
'attribute': attribute,
|
||||||
@ -261,7 +323,12 @@ class NIST(Source):
|
|||||||
})
|
})
|
||||||
|
|
||||||
def new_compound_request(self, compound):
|
def new_compound_request(self, compound):
|
||||||
|
"""
|
||||||
|
This function is called when a new synonym is returned to the spider
|
||||||
|
to generate new requests
|
||||||
|
:param compound: the name of the compound to search for
|
||||||
|
"""
|
||||||
if compound not in self.ignore_list:
|
if compound not in self.ignore_list:
|
||||||
self.ignore_list.update(compound)
|
self.ignore_list.update(compound)
|
||||||
return Request(url=self.website[:-1] + self.search % compound,
|
return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
|
||||||
callback=self.parse)
|
callback=self.parse)
|
||||||
|
@ -1,9 +1,11 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
from scrapy.http import Request
|
from scrapy.http import Request
|
||||||
from scrapy import log
|
from scrapy import log
|
||||||
from source import Source
|
|
||||||
from scrapy.selector import Selector
|
from scrapy.selector import Selector
|
||||||
|
|
||||||
|
from source import Source
|
||||||
from FourmiCrawler.items import Result
|
from FourmiCrawler.items import Result
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
class PubChem(Source):
|
class PubChem(Source):
|
||||||
@ -13,10 +15,10 @@ class PubChem(Source):
|
|||||||
including sources of the values of properties.
|
including sources of the values of properties.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
#PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
|
# PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
|
||||||
website = 'https://*.ncbi.nlm.nih.gov/*'
|
website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
|
||||||
website_www = 'https://www.ncbi.nlm.nih.gov/*'
|
website_www = 'http://www.ncbi.nlm.nih.gov/*'
|
||||||
website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
|
website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
|
||||||
search = 'pccompound?term=%s'
|
search = 'pccompound?term=%s'
|
||||||
data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
|
data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
|
||||||
|
|
||||||
@ -49,14 +51,15 @@ class PubChem(Source):
|
|||||||
self._spider.get_synonym_requests(synonym)
|
self._spider.get_synonym_requests(synonym)
|
||||||
log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
|
log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
|
||||||
|
|
||||||
n = re.search(r'cid=(\d+)',response.url)
|
n = re.search(r'cid=(\d+)', response.url)
|
||||||
if n:
|
if n:
|
||||||
cid = n.group(1)
|
cid = n.group(1)
|
||||||
log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach
|
log.msg('cid: %s' % cid, level=log.DEBUG) # getting the right id of the compound with which it can reach
|
||||||
# the seperate html page which contains the properties and their values
|
# the seperate html page which contains the properties and their values
|
||||||
|
|
||||||
#using this cid to get the right url and scrape it
|
# using this cid to get the right url and scrape it
|
||||||
requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
|
requests.append(
|
||||||
|
Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
|
||||||
return requests
|
return requests
|
||||||
|
|
||||||
def parse_data(self, response):
|
def parse_data(self, response):
|
||||||
@ -72,22 +75,22 @@ class PubChem(Source):
|
|||||||
props = sel.xpath('//div')
|
props = sel.xpath('//div')
|
||||||
|
|
||||||
for prop in props:
|
for prop in props:
|
||||||
prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing
|
prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing
|
||||||
if prop.xpath('a'): # parsing for single value in property
|
if prop.xpath('a'): # parsing for single value in property
|
||||||
prop_source = ''.join(prop.xpath('a/@title').extract())
|
prop_source = ''.join(prop.xpath('a/@title').extract())
|
||||||
prop_value = ''.join(prop.xpath('a/text()').extract())
|
prop_value = ''.join(prop.xpath('a/text()').extract())
|
||||||
new_prop = Result({
|
new_prop = Result({
|
||||||
'attribute': prop_name,
|
'attribute': prop_name,
|
||||||
'value': prop_value,
|
'value': prop_value,
|
||||||
'source': prop_source,
|
'source': prop_source,
|
||||||
'reliability': 'Unknown',
|
'reliability': self.cfg['reliability'],
|
||||||
'conditions': ''
|
'conditions': ''
|
||||||
})
|
})
|
||||||
log.msg('PubChem prop: |%s| |%s| |%s|' %
|
log.msg('PubChem prop: |%s| |%s| |%s|' %
|
||||||
(new_prop['attribute'], new_prop['value'],
|
(new_prop['attribute'], new_prop['value'],
|
||||||
new_prop['source']), level=log.DEBUG)
|
new_prop['source']), level=log.DEBUG)
|
||||||
requests.append(new_prop)
|
requests.append(new_prop)
|
||||||
elif prop.xpath('ul'): # parsing for multiple values (list) in property
|
elif prop.xpath('ul'): # parsing for multiple values (list) in property
|
||||||
prop_values = prop.xpath('ul//li')
|
prop_values = prop.xpath('ul//li')
|
||||||
for prop_li in prop_values:
|
for prop_li in prop_values:
|
||||||
prop_value = ''.join(prop_li.xpath('a/text()').extract())
|
prop_value = ''.join(prop_li.xpath('a/text()').extract())
|
||||||
@ -96,16 +99,51 @@ class PubChem(Source):
|
|||||||
'attribute': prop_name,
|
'attribute': prop_name,
|
||||||
'value': prop_value,
|
'value': prop_value,
|
||||||
'source': prop_source,
|
'source': prop_source,
|
||||||
'reliability': 'Unknown',
|
'reliability': self.cfg['reliability'],
|
||||||
'conditions': ''
|
'conditions': ''
|
||||||
})
|
})
|
||||||
log.msg('PubChem prop: |%s| |%s| |%s|' %
|
log.msg('PubChem prop: |%s| |%s| |%s|' %
|
||||||
(new_prop['attribute'], new_prop['value'],
|
(new_prop['attribute'], new_prop['value'],
|
||||||
new_prop['source']), level=log.DEBUG)
|
new_prop['source']), level=log.DEBUG)
|
||||||
requests.append(new_prop)
|
requests.append(new_prop)
|
||||||
|
|
||||||
return requests
|
return requests
|
||||||
|
|
||||||
|
def parse_searchrequest(self, response):
|
||||||
|
"""
|
||||||
|
This function parses the response to the new_compound_request Request
|
||||||
|
:param response: the Response object to be parsed
|
||||||
|
:return: A Request for the compound page or what self.parse returns in
|
||||||
|
case the search request forwarded to the compound page
|
||||||
|
"""
|
||||||
|
|
||||||
|
# check if pubchem forwarded straight to compound page
|
||||||
|
m = re.match(self.website_pubchem, response.url)
|
||||||
|
if m:
|
||||||
|
log.msg('PubChem search forwarded to compound page',
|
||||||
|
level=log.DEBUG)
|
||||||
|
return self.parse(response)
|
||||||
|
|
||||||
|
sel = Selector(response)
|
||||||
|
|
||||||
|
results = sel.xpath('//div[@class="rsltcont"]')
|
||||||
|
if results:
|
||||||
|
url = results[0].xpath('div/p/a[1]/@href')
|
||||||
|
else:
|
||||||
|
log.msg('PubChem search found nothing or xpath failed',
|
||||||
|
level=log.DEBUG)
|
||||||
|
return None
|
||||||
|
|
||||||
|
if url:
|
||||||
|
url = 'http:' + ''.join(url[0].extract())
|
||||||
|
log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
|
||||||
|
else:
|
||||||
|
log.msg('PubChem search found results, but no url in first result',
|
||||||
|
level=log.DEBUG)
|
||||||
|
return None
|
||||||
|
|
||||||
|
return Request(url=url, callback=self.parse)
|
||||||
|
|
||||||
def new_compound_request(self, compound):
|
def new_compound_request(self, compound):
|
||||||
return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
|
return Request(url=self.website_www[:-1] + self.search % compound,
|
||||||
|
callback=self.parse_searchrequest)
|
||||||
|
@ -15,7 +15,7 @@ class WikipediaParser(Source):
|
|||||||
It also returns requests with other external sources which contain information on parsed subject.
|
It also returns requests with other external sources which contain information on parsed subject.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
website = "http://en.wikipedia.org/wiki/*"
|
website = "http://en\\.wikipedia\\.org/wiki/.*"
|
||||||
__spider = None
|
__spider = None
|
||||||
searched_compounds = []
|
searched_compounds = []
|
||||||
|
|
||||||
@ -123,7 +123,7 @@ class WikipediaParser(Source):
|
|||||||
return items
|
return items
|
||||||
|
|
||||||
def new_compound_request(self, compound):
|
def new_compound_request(self, compound):
|
||||||
return Request(url=self.website[:-1] + compound, callback=self.parse)
|
return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def clean_items(items):
|
def clean_items(items):
|
||||||
|
@ -3,7 +3,7 @@ from scrapy import log
|
|||||||
|
|
||||||
|
|
||||||
class Source:
|
class Source:
|
||||||
website = "http://something/*" # Regex of URI's the source is able to parse
|
website = "http://something/.*" # Regex of URI's the source is able to parse
|
||||||
_spider = None
|
_spider = None
|
||||||
|
|
||||||
def __init__(self, config=None):
|
def __init__(self, config=None):
|
||||||
@ -30,7 +30,7 @@ class Source:
|
|||||||
:param compound: A compound name.
|
:param compound: A compound name.
|
||||||
:return: A new Scrapy Request
|
:return: A new Scrapy Request
|
||||||
"""
|
"""
|
||||||
# return Request(url=self.website[:-1] + compound, callback=self.parse)
|
# return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def set_spider(self, spider):
|
def set_spider(self, spider):
|
||||||
|
@ -34,8 +34,9 @@ class FourmiSpider(Spider):
|
|||||||
"""
|
"""
|
||||||
for source in self._sources:
|
for source in self._sources:
|
||||||
if re.match(source.website, response.url):
|
if re.match(source.website, response.url):
|
||||||
log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
|
log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
|
||||||
return source.parse(response)
|
return source.parse(response)
|
||||||
|
log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_synonym_requests(self, compound, force=False):
|
def get_synonym_requests(self, compound, force=False):
|
||||||
|
10
GUI.cfg.sample
Normal file
10
GUI.cfg.sample
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
[GUI]
|
||||||
|
# Personalize options in your User Interface
|
||||||
|
|
||||||
|
# Commonly used parameters are listed in the GUI for easy selection
|
||||||
|
CommonParameters = Weight, Polarity, Viscosity, Solubility, Name
|
||||||
|
|
||||||
|
# Parameters that are always used in the search
|
||||||
|
AlwaysParameters = Name
|
||||||
|
|
||||||
|
OutputTypes = csv, json, jsonlines, xml
|
1
GUI/__init__.py
Normal file
1
GUI/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
import gui
|
30
GUI/configImporter.py
Normal file
30
GUI/configImporter.py
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
import ConfigParser
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigImporter():
|
||||||
|
def __init__(self, filename):
|
||||||
|
"""Read the filename into the parser."""
|
||||||
|
self.filename = filename
|
||||||
|
self.parser = ConfigParser.ConfigParser()
|
||||||
|
self.parser.read(self.filename)
|
||||||
|
|
||||||
|
def load_common_attributes(self):
|
||||||
|
"""Loads common attributes from the initialized file."""
|
||||||
|
try:
|
||||||
|
return self.parser.get('GUI', 'CommonParameters')
|
||||||
|
except:
|
||||||
|
return 'One, Two, Three'
|
||||||
|
|
||||||
|
def load_output_types(self):
|
||||||
|
"""Loads output types from the initialized file."""
|
||||||
|
try:
|
||||||
|
return self.parser.get('GUI', 'OutputTypes')
|
||||||
|
except:
|
||||||
|
return 'csv'
|
||||||
|
|
||||||
|
def load_always_attributes(self):
|
||||||
|
"""Loads attributes that are always searched for from the initialized file."""
|
||||||
|
try:
|
||||||
|
return self.parser.get('GUI', 'AlwaysParameters')
|
||||||
|
except:
|
||||||
|
return 'Name, Weight'
|
196
GUI/gui.py
Normal file
196
GUI/gui.py
Normal file
@ -0,0 +1,196 @@
|
|||||||
|
from Tkinter import *
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
from tkFileDialog import asksaveasfilename
|
||||||
|
|
||||||
|
from configImporter import *
|
||||||
|
|
||||||
|
|
||||||
|
class GUI():
|
||||||
|
def __init__(self, search, config_file='GUI.cfg', sourceloader=None, in_source=True):
|
||||||
|
"""Boots the window, configuration."""
|
||||||
|
if not in_source:
|
||||||
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
config_file = current_dir + '../' + config_file
|
||||||
|
if not os.path.isfile(config_file):
|
||||||
|
try:
|
||||||
|
shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../GUI.cfg.sample", config_file)
|
||||||
|
except IOError:
|
||||||
|
print "GUI configuration couldn't be found and couldn't be created."
|
||||||
|
sys.exit()
|
||||||
|
self.configurator = ConfigImporter(config_file)
|
||||||
|
self.sourceloader = sourceloader
|
||||||
|
self.finish_with_search = False
|
||||||
|
self.values = {}
|
||||||
|
self.required_variables = ['substance']
|
||||||
|
self.search = search
|
||||||
|
self.window, self.variables = self.generate_window(self.load_common_attributes(), self.load_output_types())
|
||||||
|
|
||||||
|
def load_common_attributes(self):
|
||||||
|
"""Calls the configuration parser for common attributes."""
|
||||||
|
return [x.strip() for x in self.configurator.load_common_attributes().split(',')]
|
||||||
|
|
||||||
|
def load_output_types(self):
|
||||||
|
"""Calls the configuration parser for output types."""
|
||||||
|
return [x.strip() for x in self.configurator.load_output_types().split(',')]
|
||||||
|
|
||||||
|
def load_always_attributes(self):
|
||||||
|
"""Calls the configuration parser for attributes that are always used."""
|
||||||
|
return ','.join([x.strip() for x in self.configurator.load_always_attributes().split(',')])
|
||||||
|
|
||||||
|
def set_output(self):
|
||||||
|
self.variable_output_name.set(asksaveasfilename())
|
||||||
|
self.button_output_name.config(text=self.variable_output_name.get())
|
||||||
|
|
||||||
|
def generate_window(self, common_attributes, output_types):
|
||||||
|
"""Creates all widgets and variables in the window."""
|
||||||
|
window = Tk()
|
||||||
|
window.wm_title("Fourmi Crawler")
|
||||||
|
|
||||||
|
variables = {}
|
||||||
|
|
||||||
|
variable_substance = StringVar(window)
|
||||||
|
frame_substance = Frame(window)
|
||||||
|
label_substance = Label(frame_substance, text="Substance: ")
|
||||||
|
input_substance = Entry(frame_substance, font=("Helvetica", 12), width=25, textvariable=variable_substance)
|
||||||
|
variables.update({"substance": variable_substance})
|
||||||
|
frame_substance.pack(side=TOP)
|
||||||
|
label_substance.pack()
|
||||||
|
input_substance.pack()
|
||||||
|
input_substance.focus()
|
||||||
|
|
||||||
|
frame_all_attributes = Frame(window)
|
||||||
|
frame_selecting_attributes = Frame(frame_all_attributes)
|
||||||
|
frame_new_attributes = Frame(frame_selecting_attributes)
|
||||||
|
label_new_attributes = Label(frame_new_attributes, text="Parameters: ")
|
||||||
|
input_new_attributes = Text(frame_new_attributes, font=("Helvetica", 8), width=25, height=7, padx=5, pady=5)
|
||||||
|
variables.update({"new_attributes": input_new_attributes})
|
||||||
|
frame_new_attributes.pack(side=LEFT)
|
||||||
|
label_new_attributes.pack()
|
||||||
|
input_new_attributes.pack()
|
||||||
|
|
||||||
|
frame_common_attributes = Frame(frame_selecting_attributes)
|
||||||
|
label_common_attributes = Label(frame_common_attributes, text="Common Parameters: ")
|
||||||
|
input_common_attributes = Listbox(frame_common_attributes, selectmode=MULTIPLE, height=7)
|
||||||
|
scrollbar_common_attributes = Scrollbar(frame_common_attributes)
|
||||||
|
input_common_attributes.config(yscrollcommand=scrollbar_common_attributes.set)
|
||||||
|
scrollbar_common_attributes.config(command=input_common_attributes.yview)
|
||||||
|
if common_attributes and len(common_attributes) > 0:
|
||||||
|
input_common_attributes.insert(END, *common_attributes)
|
||||||
|
variables.update({"common_attributes": input_common_attributes})
|
||||||
|
frame_common_attributes.pack(side=RIGHT)
|
||||||
|
label_common_attributes.pack(side=TOP)
|
||||||
|
input_common_attributes.pack(side=LEFT)
|
||||||
|
scrollbar_common_attributes.pack(side=RIGHT, fill=Y)
|
||||||
|
frame_selecting_attributes.pack()
|
||||||
|
|
||||||
|
frame_last = Frame(window)
|
||||||
|
search_button = Button(frame_last, text="Start search", command=self.prepare_search)
|
||||||
|
cancel_button = Button(frame_last, text="Cancel", command=window.destroy)
|
||||||
|
frame_last.pack(side=BOTTOM)
|
||||||
|
search_button.pack(side=LEFT)
|
||||||
|
cancel_button.pack(side=RIGHT)
|
||||||
|
|
||||||
|
frame_name = Frame(window)
|
||||||
|
frame_output_name = Frame(frame_name)
|
||||||
|
label_output_name = Label(frame_output_name, text='Output file:')
|
||||||
|
self.variable_output_name = StringVar()
|
||||||
|
self.variable_output_name.set('results.csv')
|
||||||
|
variables.update({'output_name':self.variable_output_name})
|
||||||
|
self.button_output_name = Button(frame_output_name, command=self.set_output, text="Select file")
|
||||||
|
frame_output_name.pack(side=LEFT)
|
||||||
|
label_output_name.pack()
|
||||||
|
self.button_output_name.pack()
|
||||||
|
frame_name.pack(side=BOTTOM)
|
||||||
|
|
||||||
|
|
||||||
|
frame_checkboxes = Frame(window)
|
||||||
|
frame_checkbox_attributes = Frame(frame_checkboxes)
|
||||||
|
variable_all_attributes = BooleanVar()
|
||||||
|
variable_all_attributes.set(True)
|
||||||
|
input_all_attributes = Checkbutton(frame_checkbox_attributes, text="Search ALL parameters",
|
||||||
|
variable=variable_all_attributes)
|
||||||
|
variables.update({"all_attributes": variable_all_attributes})
|
||||||
|
frame_checkbox_attributes.pack(side=LEFT)
|
||||||
|
input_all_attributes.pack()
|
||||||
|
|
||||||
|
frame_logging = Frame(frame_checkboxes)
|
||||||
|
variable_logging = BooleanVar()
|
||||||
|
variable_logging.set(False)
|
||||||
|
input_logging = Checkbutton(frame_logging, text="Verbose logging", variable=variable_logging)
|
||||||
|
variables.update({'logging':variable_logging})
|
||||||
|
frame_logging.pack(side=RIGHT)
|
||||||
|
frame_checkboxes.pack(side=BOTTOM)
|
||||||
|
input_logging.pack()
|
||||||
|
frame_all_attributes.pack()
|
||||||
|
|
||||||
|
return window, variables
|
||||||
|
|
||||||
|
def prepare_search(self):
|
||||||
|
"""Saves the values from the window for later retrieval."""
|
||||||
|
variables = self.variables
|
||||||
|
values = {}
|
||||||
|
|
||||||
|
values.update({"Always attributes": self.load_always_attributes()})
|
||||||
|
for name, var in variables.iteritems():
|
||||||
|
if var.__class__ is StringVar:
|
||||||
|
values.update({name: var.get()})
|
||||||
|
elif var.__class__ is BooleanVar:
|
||||||
|
values.update({name: var.get()})
|
||||||
|
elif var.__class__ is Text:
|
||||||
|
values.update({name: str(var.get("1.0", END)).strip()})
|
||||||
|
elif var.__class__ is Listbox:
|
||||||
|
values.update({name: ", ".join([var.get(int(i)) for i in var.curselection()])})
|
||||||
|
else:
|
||||||
|
print "No known class, {}, {}".format(name, var)
|
||||||
|
|
||||||
|
values.update({'output_name':self.variable_output_name.get()})
|
||||||
|
values.update({'output_type':self.check_output_type(values.get('output_name'))})
|
||||||
|
|
||||||
|
self.values = values
|
||||||
|
if all([values.get(i) != '' for i in self.required_variables]):
|
||||||
|
self.finish_with_search = True
|
||||||
|
self.window.destroy()
|
||||||
|
else:
|
||||||
|
self.finish_with_search = False
|
||||||
|
#tkMessageBox.showinfo('Not all required information was entered!')
|
||||||
|
|
||||||
|
def execute_search(self):
|
||||||
|
"""Calls the Fourmi crawler with the values from the GUI"""
|
||||||
|
if self.values.get('all_attributes'):
|
||||||
|
attributes = ".*"
|
||||||
|
else:
|
||||||
|
attribute_types = ['attributes', 'Common attributes', 'Always attributes']
|
||||||
|
attributes = ','.join([str(self.values.get(attribute)) for attribute in attribute_types])
|
||||||
|
output_file = "file://" + str(self.values.get('output_name')) #Dealing with absolute paths
|
||||||
|
|
||||||
|
arguments = {'--attributes': attributes,
|
||||||
|
'--exclude': None,
|
||||||
|
'--format': self.values.get('output_type'),
|
||||||
|
'--help': False,
|
||||||
|
'--include': None,
|
||||||
|
'--log': 'log.txt',
|
||||||
|
'--output': output_file,
|
||||||
|
'-v': 0 if self.values.get('logging') else 3,
|
||||||
|
'--version': False,
|
||||||
|
'<compound>': self.values.get('substance'),
|
||||||
|
'list': False,
|
||||||
|
'search': True}
|
||||||
|
|
||||||
|
self.search(arguments, self.sourceloader)
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
"""Starts the window and the search."""
|
||||||
|
self.window.mainloop()
|
||||||
|
if self.finish_with_search:
|
||||||
|
self.execute_search()
|
||||||
|
|
||||||
|
def check_output_type(self, filename):
|
||||||
|
parts = str(filename).split('.')
|
||||||
|
output_types = self.load_output_types()
|
||||||
|
extension = parts[-1]
|
||||||
|
|
||||||
|
for type in output_types:
|
||||||
|
if extension==type:
|
||||||
|
return extension
|
||||||
|
return output_types[0]
|
@ -48,7 +48,6 @@ __Main goals:__
|
|||||||
- Build an graphical user interface(GUI) as alternative for the command line
|
- Build an graphical user interface(GUI) as alternative for the command line
|
||||||
interface(CLI). (Assignee: Harmen)
|
interface(CLI). (Assignee: Harmen)
|
||||||
- Compiling the source into an windows executable. (Assignee: Bas)
|
- Compiling the source into an windows executable. (Assignee: Bas)
|
||||||
- Create an module to gather data from PubChem. (Assignee: Nout)
|
|
||||||
|
|
||||||
__Side goals:__
|
__Side goals:__
|
||||||
|
|
||||||
|
97
SIGNED.md
97
SIGNED.md
@ -3,19 +3,19 @@
|
|||||||
-----BEGIN PGP SIGNATURE-----
|
-----BEGIN PGP SIGNATURE-----
|
||||||
Version: GnuPG v1.4.11 (GNU/Linux)
|
Version: GnuPG v1.4.11 (GNU/Linux)
|
||||||
|
|
||||||
iQIcBAABAgAGBQJTn3GgAAoJEJrQ9RIUCT6/CI4P/RSAQrd6JugGZoQu/gNdW6eB
|
iQIcBAABAgAGBQJTpMZAAAoJEJrQ9RIUCT6/Hf8P/AyX9ZD5zj6rBi2CwDOTs5aa
|
||||||
MYCybqYGZiieVhUaGOnFNVlp68YpXH+sP/Uc6hXEX30UQEsDmhMeT5NA7ZMS+zJ9
|
flVqw9syvdqTzVfXQaR4UrCSOuyuOeAkiqub0BMjxyCurqAwN/SCPf3uOJ/tGXmt
|
||||||
MNHGQdJq22lGb3+VoVBV4RTMdkQXOXvx6p5biskjIEtM3tfTxP529GvAX2TFUNnt
|
ZPtYVHjevJ4mbojLhZiJ2av8LC9VOh3Zl+reR3L2cLuBD4rVSrfUMJtczbbtNlk+
|
||||||
gGWk28EDr30M95XwDxwWo+57Xv8VtSb3VSvXEbrdwGYf8EoQo9oPtzYQ0YcdupcC
|
+mczRcTpzNvHQW6mKqyUoKn8xqNnLC7C+p5ybNZ5EADUfoKIF1xyTN6je6fpYZ1U
|
||||||
ET8bukYVcwpAjoTnPlEy89TiHHohwmimr2ASXeQ64Ks5wfjzcF7NENCAmaAfR+KI
|
IHxiUzeOvfX9ohmbfnfkpkuSll1nUJWsTgUPKhthJuxEhwCQ1xMdWhxfcyZJaMT2
|
||||||
VLLuGqdWMBx1ewVuAXTCZ0Mga/kBoRUaO0PC13UmL8LhhZY9Z3cwD4UnPU35/RQi
|
Pxgo8C8S6lzAk4PxBRBoePjgWAeaFmbr317WXHvw6SSHPIdzToKZgDiDC5LWvKxb
|
||||||
IbLfQcZHf/gEvyMeiTYCsyWpm+/xxn1+EfHol4/Q9VSXzZgRBX05Ik6tqeCvjdgG
|
RRdLZ6w7tg0/FSUexekrUafGT8Je0oIoLUQlNaEQzrPNhDpma1uHFfZg0vb2m4Hq
|
||||||
4PyHBaJTTm/HfMNdg3mr1mbyjTv5UxglEyPv+Y4NdfoVfepkXsXbzvNSyVffZ3Bw
|
WHLLKTCr6FMczhP1TmuIEtdjKtymT+rO+Ls4ciw+654R7MtBYcmTr+RqmAd+GadJ
|
||||||
UaFp7KzIC4Jugdpv63FleiAdDY0+iZ5shH86wD1+HJ0/a87kn5Ao1yESby7J7U+f
|
vJNmGDod2oPwCydEps8bYAbksqRhMmk3xwco/g6dWYh5/+1GzCr80J7fYpqtoPFH
|
||||||
poZQYeMFeuC0T5hY/3iYoyvZ68oH918ESESiucSulp5BvfwuqGL2+xo5uJIwGYXE
|
V5qKyDQovF5jPlb/buq4mH8XYVT1z4Sx8azKVctMLig57zRnvN0WyskpT09oY7dK
|
||||||
3IDQC7xbA14JHX86IVJlSHAD33iWyiC+5yjw4/bRRVl37KPsLdHiXH3YIRnF5I2I
|
TPvIqwTixekndYLcM3QacVq/NhVOOQPFvD0PwU18eKs4EfD2L7iWd2XjV9Az++aD
|
||||||
ZbM/uDYyJdZbBe4UoCoF
|
jUY6EwEuOzDCexWP4eM8
|
||||||
=AMhi
|
=h6TK
|
||||||
-----END PGP SIGNATURE-----
|
-----END PGP SIGNATURE-----
|
||||||
|
|
||||||
```
|
```
|
||||||
@ -27,38 +27,45 @@ ZbM/uDYyJdZbBe4UoCoF
|
|||||||
#### Expect
|
#### Expect
|
||||||
|
|
||||||
```
|
```
|
||||||
size exec file contents
|
size exec file contents
|
||||||
./
|
./
|
||||||
375 .gitignore d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1
|
412 .gitignore 25059da2ee328837ece01b979cd5c1083ed1679372f06c14c1c58035d8120614
|
||||||
464 .travis.yml 3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c
|
548 .travis.yml 7f11bc58a8e94276ef949afeb107f9f1e184c0dbb84f821705ea2245902ed546
|
||||||
428 Changelog.md c7791d1914ddca9ff1549d90468a79787a7feafe94cecd756e3d7cbd4bcbc7df
|
846 Changelog.md 345f9aea4812b37b1b2714703ea0d5edd27414c0f839ec3e322450ad5ec5c6ed
|
||||||
FourmiCrawler/
|
FourmiCrawler/
|
||||||
0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
|
0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
|
||||||
304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806
|
304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806
|
||||||
2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49
|
2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49
|
||||||
914 settings.py 0be2eaf8e83e85ed27754c896421180fc80cb5ce44449aa9f1048e465d1a96f2
|
677 settings.py f1e7d21b899ffc2523516c0ebe67d967dc62495b90c2fe34651042a3049fcd94
|
||||||
sources/
|
sources/
|
||||||
9991 ChemSpider.py 847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d
|
12103 ChemSpider.py f647d70acf9b3f1ee7bde75586aa45156331f977ca7fe836ceac4477a2c0d4ce
|
||||||
9898 NIST.py 97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644
|
12400 NIST.py cdb4c423355ac8fb1097197a9f8df44f667925a785c6bae7c583820da08908ee
|
||||||
4754 PubChem.py 58ed4c92519e385f2768cf8034b006b18f8a21632cb1c5a0849b1a329a8c6ffb
|
6121 PubChem.py 8f8ad40459090b818a384a202e739fe4696a04154df2b8419aee896b0fa02481
|
||||||
6907 WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97
|
6930 WikipediaParser.py ae9f57bbf2aad9c371abcd143fd2dda5995a196cb700734a5035dd94b1988870
|
||||||
0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
|
0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
|
||||||
1262 source.py 16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4
|
1281 source.py 7927fda259ff2c8096fa526db1f08586de6e04473a491e19a07b092fdeed81fc
|
||||||
3026 spider.py 1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a
|
3111 spider.py ec7c946907fea10c17ee6dd88a506f3e3bf2cd748e3eb09200487fcec2ae7ba3
|
||||||
1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c
|
GUI/
|
||||||
3965 README.md d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3
|
11 __init__.py 40567015c415e853210425c1b4f3834dbc2a3165e3713e04dd3424b79bc90aa3
|
||||||
3676 x fourmi.py 2ff89f97fd2a49d08417d9ab6cf08e88944d0c45f54ec84550b530be48676c23
|
940 configImporter.py 5d731d63a3117b25b7e556a746a1dd5b16e8cbb60e57be46de333c31c8c00271
|
||||||
261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85
|
8776 gui.py 20b2220bc3ca55ebfd6d04e8c0bebbf1ae316c85a54db60b8fc02d22642f19d5
|
||||||
tests/
|
299 GUI.cfg.sample 4ee27f7099d588c21358cd645a21621e631d80712f1b514dad898faa5fee2483
|
||||||
1 __init__.py 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
|
1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c
|
||||||
2837 test_configurator.py 4a0eb6e7121eb09a63ab5cb797570d1a42080c5346c3b8b365da56eefa599e80
|
3900 README.md f4a1e3ea1700d2b415acfad661cb45f960fe8e8ffbe98dbecb6c7ed071a101ac
|
||||||
1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031
|
3846 x fourmi.py f0b11f5f153f96f6af2e504cdf369e43c04316752de131a659eb6246fd80212a
|
||||||
1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869
|
261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85
|
||||||
2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299
|
416 sources.cfg.sample 11cd0fc18693da17883c98d25a384ae1b6158adfef13778b6dd02b878f6b8a70
|
||||||
utils/
|
tests/
|
||||||
0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
|
107 __init__.py ce90e54e58a0912cadbe3adcf5166dc72477bf9ce289bf427f8e2f5b25406670
|
||||||
3552 configurator.py e2b7e0ee6c1fef4373785dfe5df8ec6950f31ce6a5d9632b69a66ea3d1eaf921
|
2870 test_configurator.py 318d542b1cda5075a2a9a6be97e9e7a79372ee58e1ab3014c161534094f7364d
|
||||||
2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37
|
1315 test_gui.py 0fb95d0b542765bf52bcebb037bf2ed1299209beab23448af741a93c9fbb1ca8
|
||||||
|
1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031
|
||||||
|
1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869
|
||||||
|
2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299
|
||||||
|
utils/
|
||||||
|
40 __init__.py f1237ae74693e2ec1b3154e57aec27438a80a735e5ccf2411aecd194ef443b6a
|
||||||
|
4047 configurator.py 8b566a0435a9f105a8ec616b16c3e21edb9b82f8debe1ef9f1df6bbbf20949d5
|
||||||
|
2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Ignore
|
#### Ignore
|
||||||
|
fourmi.py (15 changed lines)
@@ -1,8 +1,9 @@
 #!/usr/bin/env python
 """
-Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).
+Fourmi, a web scraper build to search specific information for a given compound (and its pseudonyms).

 Usage:
+    fourmi
     fourmi search <compound>
     fourmi [options] search <compound>
     fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
@@ -17,7 +18,7 @@ Options:
     --version                       Show version.
     -v                              Verbose logging output. (Multiple occurrences increase logging level)
     --log=<file>                    Save log to an file.
-    -o <file> --output=<file>       Output file [default: results.*format*]
+    -o <file> --output=<file>       Output file [default: <compound>.*format*]
     -f <format> --format=<format>   Output formats (supported: csv, json, jsonlines, xml) [default: csv]
     --include=<regex>               Include only sources that match these regular expressions split by a comma.
     --exclude=<regex>               Exclude the sources that match these regular expressions split by a comma.
@@ -31,6 +32,7 @@ import docopt
 from FourmiCrawler.spider import FourmiSpider
 from utils.configurator import Configurator
 from utils.sourceloader import SourceLoader
+from GUI import gui


 def setup_crawler(compound, settings, source_loader, attributes):
@@ -58,18 +60,18 @@ def search(docopt_arguments, source_loader):
     """
     conf = Configurator()
     conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
-    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
+    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
     setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
                   source_loader, docopt_arguments["--attributes"].split(','))
     if conf.scrapy_settings.getbool("LOG_ENABLED"):
         log.start(conf.scrapy_settings.get("LOG_FILE"),
                   conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
     reactor.run()


 # The start for the Fourmi Command Line interface.
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.3')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.6.0')
     loader = SourceLoader()

     if arguments["--include"]:
@@ -82,3 +84,6 @@ if __name__ == '__main__':
     elif arguments["list"]:
         print "-== Available Sources ==-"
         print str(loader)
+    else:
+        gui_window = gui.GUI(search, sourceloader=SourceLoader())
+        gui_window.run()
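With the new bare `fourmi` usage line, running the tool without a sub-command no longer ends in a docopt usage error; both `arguments["search"]` and `arguments["list"]` come back False, so the new `else:` branch above opens the GUI. A small sketch of that docopt behaviour follows; the usage string is a trimmed stand-in for the real docstring, and "caffeine" is just an example compound.

```
# Sketch only: a trimmed usage string showing how docopt routes a bare
# `fourmi` call to the GUI branch and `fourmi search <compound>` to the CLI.
import docopt

usage = """
Usage:
    fourmi
    fourmi search <compound>
    fourmi list
"""

args = docopt.docopt(usage, argv=[])
print args["search"], args["list"]          # False False -> GUI branch
args = docopt.docopt(usage, argv=["search", "caffeine"])
print args["search"], args["<compound>"]    # True caffeine -> CLI search
```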
sources.cfg.sample (new file, 19 lines)
@@ -0,0 +1,19 @@
+[DEFAULT]
+reliability = Unknown
+
+#For each source listed in FourmiCrawler/sources there should be a section
+#named exactly as the filename in here. If not present, the DEFAULT value is
+#used for reliability of that source.
+
+[ChemSpider]
+reliability = High
+#token=Paste ChemSpider API token here and remove the hashtag
+
+[NIST]
+reliability = High
+
+[WikipediaParser]
+reliability = Medium
+
+[PubChem]
+reliability = High
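The sample relies on ConfigParser's [DEFAULT] section: any source that has no section of its own, or no reliability key, falls back to "Unknown". A minimal sketch of that lookup, assuming a sources.cfg copied from the sample above ("SomeNewSource" is a hypothetical, unlisted source):

```
import ConfigParser

# Python 2, matching the project. Reads a sources.cfg copied from the sample.
config = ConfigParser.ConfigParser()
config.read('sources.cfg')

for source in ['ChemSpider', 'NIST', 'WikipediaParser', 'PubChem', 'SomeNewSource']:
    if config.has_section(source):
        # Comes from the source's own section, or from [DEFAULT] if missing.
        print source, config.get(source, 'reliability')
    else:
        # No section at all: fall back to the DEFAULT value explicitly.
        print source, config.defaults().get('reliability', 'Unknown')
```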
tests/__init__.py
@@ -1 +1,6 @@
+import test_configurator
+import test_gui
+import test_pipeline
+import test_sourceloader
+import test_spider
+
tests/test_configurator.py
@@ -10,16 +10,16 @@ class TestConfigurator(unittest.TestCase):
         self.conf = Configurator()

     def test_set_output(self):
-        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.conf.set_output(filename="test.txt", fileformat="csv", compound="test")
         self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")

-        self.conf.set_output("results.*format*", "jsonlines")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.conf.set_output("<compound>.*format*", "jsonlines", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")

-        self.conf.set_output("results.*format*", "csv")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.conf.set_output("<compound>.*format*", "csv", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")

     def test_start_log(self):
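These tests pin down the new compound-based file naming. When iterating on just this file, the plain unittest runner works as well as the project's nosetests invocation; a minimal sketch, assuming it is run from the repository root:

```
# Minimal sketch: run only the configurator tests, without nose.
import unittest

from tests.test_configurator import TestConfigurator

suite = unittest.TestLoader().loadTestsFromTestCase(TestConfigurator)
unittest.TextTestRunner(verbosity=2).run(suite)
```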
tests/test_gui.py (new file, 32 lines)
@@ -0,0 +1,32 @@
+import unittest
+
+from GUI import gui
+
+
+class TestGUI(unittest.TestCase):
+    def setUp(self):
+        pass
+
+    def test_empty_attributes(self):
+        self.test_gui = gui.GUI(None, config_file="../GUI.cfg.sample", in_source=True)
+        self.test_gui.window.after(9, self.test_gui.prepare_search)
+        self.test_gui.window.after(11, self.test_gui.window.destroy)
+        self.test_gui.run()
+
+        output_type = self.test_gui.configurator.load_output_types().split(',')[0]
+
+        self.assertEqual(self.test_gui.values.get('substance'), '')
+        self.assertEqual(self.test_gui.values.get('output_type'), output_type)
+        self.assertEqual(self.test_gui.values.get('output_name'), 'results.csv')
+
+    def test_no_configurations(self):
+        self.test_gui = gui.GUI(None, config_file="../GUI.cfg.sample")
+        self.test_gui.configurator = gui.ConfigImporter('')
+        self.test_gui.finish_with_search = True
+        self.test_gui.window.after(9, self.test_gui.prepare_search)
+        self.test_gui.window.after(11, self.test_gui.window.destroy)
+        self.test_gui.run()
+
+        self.assertEqual(self.test_gui.values.get('substance'), '')
+        self.assertEqual(self.test_gui.values.get('output_type'), 'csv')
+        self.assertEqual(self.test_gui.values.get('output_name'), 'results.csv')
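The testing pattern used here is worth noting: because `run()` blocks in the Tkinter mainloop, each test schedules the action it wants to exercise with `window.after(...)` before starting the loop, and schedules `window.destroy` slightly later so the loop terminates on its own. A self-contained sketch of the same idea with plain Tkinter (not the Fourmi GUI classes):

```
import Tkinter as tk  # Python 2; on Python 3 this would be `import tkinter as tk`


def demo_mainloop_test():
    window = tk.Tk()
    results = []

    # Schedule the "interaction" a few milliseconds after the loop starts...
    window.after(9, lambda: results.append('clicked'))
    # ...and schedule destroy() slightly later so mainloop() returns by itself.
    window.after(11, window.destroy)

    window.mainloop()  # blocks until destroy() fires
    assert results == ['clicked']


if __name__ == '__main__':
    demo_mainloop_test()
```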
utils/__init__.py
@@ -0,0 +1,2 @@
+import configurator
+import sourceloader
utils/configurator.py
@@ -1,4 +1,6 @@
 import ConfigParser
+import os
+import shutil

 from scrapy.utils.project import get_project_settings

@@ -12,7 +14,7 @@ class Configurator:
     def __init__(self):
         self.scrapy_settings = get_project_settings()

-    def set_output(self, filename, fileformat):
+    def set_output(self, filename, fileformat, compound):
         """
         This function manipulates the Scrapy output file settings that normally would be set in the settings file.
         In the Fourmi project these are command line arguments.
@@ -20,12 +22,12 @@ class Configurator:
         :param fileformat: The format in which the output will be.
         """

-        if filename != 'results.*format*':
+        if filename != '<compound>.*format*':
             self.scrapy_settings.overrides["FEED_URI"] = filename
         elif fileformat == "jsonlines":
-            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+            self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
         elif fileformat is not None:
-            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+            self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat

         if fileformat is not None:
             self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
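In effect, when `--output` is left at its default placeholder the output file is now named after the compound, while an explicit `--output` still wins. A quick illustration of the intended behaviour, run from the project root; the expected values mirror the updated tests, and "methane" is just an example compound:

```
from utils.configurator import Configurator

conf = Configurator()

# Explicit filename: used as-is, regardless of compound.
conf.set_output("my_results.csv", "csv", "methane")
print conf.scrapy_settings["FEED_URI"]     # my_results.csv

# Default placeholder: the compound name becomes the file name.
conf.set_output("<compound>.*format*", "jsonlines", "methane")
print conf.scrapy_settings["FEED_URI"]     # methane.json
print conf.scrapy_settings["FEED_FORMAT"]  # jsonlines
```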
@@ -66,8 +68,16 @@ class Configurator:
         variables for sources
         :return a ConfigParser object of sources.cfg
         """
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        config_path = current_dir + '/../sources.cfg'
+        # [TODO]: location of sources.cfg should be softcoded eventually
+        if not os.path.isfile(config_path):
+            try:
+                shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../sources.cfg.sample", config_path)
+            except IOError:
+                print "WARNING: Source configuration couldn't be found and couldn't be created."
         config = ConfigParser.ConfigParser()
-        config.read('sources.cfg')  # [TODO]: should be softcoded eventually
+        config.read(config_path)
         return config

     @staticmethod
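This "copy the .sample file on first use" step is what the changelog's automatic config file creation refers to: the configuration is now resolved next to the package with an absolute path, and bootstrapped from the sample when missing. A generic sketch of the pattern; the `ensure_config` helper below is illustrative only, not part of the Fourmi code base:

```
import os
import shutil


def ensure_config(config_path, sample_path):
    """Return config_path, copying sample_path into place if it is missing."""
    if not os.path.isfile(config_path):
        try:
            shutil.copyfile(sample_path, config_path)
        except IOError:
            print "WARNING: %s couldn't be found and couldn't be created." % config_path
    return config_path


here = os.path.dirname(os.path.abspath(__file__))
ensure_config(here + '/../sources.cfg', here + '/../sources.cfg.sample')
```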