Merge branch 'release/v0.6.0'

commit 50e6835116

.gitignore (vendored, 2 changes)
@@ -6,6 +6,8 @@
#may contain authentication information
sources.cfg
+#Another of our config files
+GUI.cfg

#THINGS WE WOULD NEVER EVER WANT!
#ignore thumbnails created by windows
.travis.yml (2 changes)

@@ -3,6 +3,10 @@
language: python
python: 2.7

+before_install:
+  - "export DISPLAY=:99.0"
+  - "sh -e /etc/init.d/xvfb start"
+
# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
install:
  - pip install Scrapy docopt

@@ -10,10 +14,10 @@ install:

# command to run tests, e.g. python setup.py test
script:
-  - nosetests --with-coverage --cover-package=FourmiCrawler,utils tests
+  - nosetests --with-coverage --cover-package=FourmiCrawler,utils,GUI tests

notifications:
  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM

after_success:
-  coveralls --verbose
+  coveralls --verbose
Changelog.md

@@ -1,3 +1,11 @@
+### v0.6.0
+- Feature: Added a graphical user interface
+- Feature: Automatic config file creation from config samples
+- FIX: The default name of the output files will now consist of the compound name and the file format when using the CLI
+- FIX: A lot of bugfixes of the PubChem plugin, as it wasn't working as it should
+- FIX: Using absolute paths for configuration files
+- DEV: General code cleanup in the documentation
+
### v0.5.3
- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
- FIX: Logging is now "actually" disabled if not using the verbose option.
FourmiCrawler/settings.py

@@ -21,7 +21,4 @@ FEED_FORMAT = 'jsonlines'
# Crawl responsibly by identifying yourself (and your website) on the
# user-agent
-
-# [todo] - Check for repercussions on spoofing the user agent
-
# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
+USER_AGENT = 'Fourmi'
FourmiCrawler/sources/ChemSpider.py

@@ -9,24 +9,28 @@ from FourmiCrawler.items import Result


# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not

class ChemSpider(Source):
-    """ChemSpider scraper for synonyms and properties
-
-    """
+    """
+    ChemSpider scraper for synonyms and properties
+    This parser will manage searching for chemicals through the
+    ChemSpider API, and parsing the resulting ChemSpider page.
+    The token required for the API should be in a configuration file
+    somewhere.
+    """

-    website = 'http://www.chemspider.com/*'
+    website = 'http://www\\.chemspider\\.com/.*'

    search = 'Search.asmx/SimpleSearch?query=%s&token='
    structure = 'Chemical-Structure.%s.html'
    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='

    def __init__(self, config=None):
+        """
+        Initialization of ChemSpider scraper
+        :param config: a dictionary of settings for this scraper, must contain
+        'reliability' key
+        """
        Source.__init__(self, config)
        self.ignore_list = []
        if 'token' not in self.cfg or self.cfg['token'] == '':
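A note on the `website` change above: this attribute doubles as the URL-matching regex that `FourmiSpider.parse` dispatches on (see spider.py further down), so the literal dots are now escaped and the glob-style `*` has become a proper `.*`. A quick sketch of the effect, using a made-up URL:

```
import re

website = 'http://www\\.chemspider\\.com/.*'

# Escaped dots match only literal dots, and '.*' is a valid regex wildcard.
print bool(re.match(website, 'http://www.chemspider.com/Chemical-Structure.2157.html'))  # True
print bool(re.match(website, 'http://wwwXchemspiderYcom/'))  # False: the dots are literal now
```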
@@ -37,6 +41,12 @@ class ChemSpider(Source):
        self.extendedinfo += self.cfg['token']

    def parse(self, response):
+        """
+        This function is called when a Response matching the variable
+        'website' is available for parsing the Response object.
+        :param response: the Scrapy Response object to be parsed
+        :return: a list of Result items and Request objects
+        """
        sel = Selector(response)
        requests = []
        requests_synonyms = self.parse_synonyms(sel)
@@ -47,10 +57,26 @@ class ChemSpider(Source):
        return requests

    def parse_properties(self, sel):
-        """scrape Experimental Data and Predicted ACD/Labs tabs"""
+        """
+        This function scrapes the Experimental Data and Predicted ACD/Labs tabs
+        :param sel: a Selector object of the whole page
+        :return: a list of Result items
+        """
        properties = []

+        properties.extend(self.parse_acdlabstab(sel))
+        properties.extend(self.parse_experimentaldatatab(sel))
+
+        return properties
+
+    def parse_acdlabstab(self, sel):
+        """
+        This function scrapes the 'Predicted ACD/Labs tab' under Properties
+        :param sel: a Selector object of the whole page
+        :return: a list of Request objects
+        """
+        properties = []
+
        # Predicted - ACD/Labs tab
        td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
            'normalize-space(string())')
        prop_names = td_list[::2]
@@ -62,16 +88,15 @@ class ChemSpider(Source):
            prop_conditions = ''

            # Test for properties without values, with one hardcoded exception
-            if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
+            if (not re.match(r'^\d', prop_value) or
+                    (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
                continue

            # Match for condition in parentheses
            m = re.match(r'(.*) \((.*)\)', prop_name)
            if m:
                prop_name = m.group(1)
                prop_conditions = m.group(2)

            # Match for condition in value separated by an 'at'
            m = re.match(r'(.*) at (.*)', prop_value)
            if m:
                prop_value = m.group(1)
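The two `re.match` calls above peel a measurement condition off either the property name or the value. A worked illustration, with hypothetical cell contents standing in for real ChemSpider data:

```
import re

# Hypothetical ChemSpider cells, for illustration only.
prop_name, prop_value = 'Boiling Point (Predicted)', '245.5 C at 760 mmHg'

m = re.match(r'(.*) \((.*)\)', prop_name)   # condition in parentheses
if m:
    prop_name, prop_conditions = m.group(1), m.group(2)

m = re.match(r'(.*) at (.*)', prop_value)   # condition after 'at'
if m:
    prop_value, prop_conditions = m.group(1), m.group(2)

print prop_name, '|', prop_value, '|', prop_conditions
# -> Boiling Point | 245.5 C | 760 mmHg
```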
@@ -84,11 +109,18 @@ class ChemSpider(Source):
                conditions=prop_conditions
            )
            properties.append(new_prop)
            log.msg('CS prop: |%s| |%s| |%s|' %
                    (new_prop['attribute'], new_prop['value'], new_prop['source']),
                    level=log.DEBUG)

-        # Experimental Data Tab, Physico-chemical properties in particular
        return properties

+    def parse_experimentaldatatab(self, sel):
+        """
+        This function scrapes Experimental Data tab, Physico-chemical
+        properties in particular.
+        :param sel: a Selector object of the whole page
+        :return: a list of Result items
+        """
        properties = []

        scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                 'Properties"]//li/table/tr/td')
        if not scraped_list:
@@ -105,15 +137,16 @@ class ChemSpider(Source):
                value=line.xpath('text()').extract()[0].rstrip(),
                source=line.xpath('strong/text()').extract()[0].rstrip(),
            )
-            properties.append(new_prop)
            log.msg('CS prop: |%s| |%s| |%s|' %
                    (new_prop['attribute'], new_prop['value'],
                     new_prop['source']), level=log.DEBUG)
+            properties.append(new_prop)

        return properties

    def parse_synonyms(self, sel):
-        """Scrape list of Names and Identifiers"""
+        """
+        This function scrapes the list of Names and Identifiers
+        :param sel: a Selector object of the whole page
+        :return: a list of Requests
+        """
        requests = []
        synonyms = []
@@ -145,7 +178,13 @@ class ChemSpider(Source):
        return requests

    def new_synonym(self, sel, name, category):
-        """Scrape for a single synonym at a given HTML tag"""
+        """
+        This function scrapes for a single synonym at a given HTML tag
+        :param sel: a Selector object of the given HTML tag
+        :param name: the name of the synonym in the tag
+        :param category: the name of the category the synonym is labeled as
+        :return: a dictionary containing data on the synonym
+        """
        self.ignore_list.append(name)
        language = sel.xpath('span[@class="synonym_language"]/text()')
        if language:
@@ -181,7 +220,12 @@ class ChemSpider(Source):
        return synonym

    def parse_extendedinfo(self, response):
-        """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
+        """
+        This function scrapes data from the ChemSpider GetExtendedCompoundInfo
+        API, if a token is present in the configuration settings
+        :param response: a Response object to be parsed
+        :return: a list of Result items
+        """
        sel = Selector(response)
        properties = []
        names = sel.xpath('*').xpath('name()').extract()
@@ -197,17 +241,31 @@ class ChemSpider(Source):
        return properties

    def newresult(self, attribute, value, conditions='', source='ChemSpider'):
-        return Result(
-            {
-                'attribute': attribute,
-                'value': value,
-                'source': source,
-                'reliability': self.cfg['reliability'],
-                'conditions': conditions
-            })
+        """
+        This function abstracts from the Result item and provides default
+        values.
+        :param attribute: the name of the attribute
+        :param value: the value of the attribute
+        :param conditions: optional conditions regarding the value
+        :param source: the name of the source if it is not ChemSpider
+        :return: A Result item
+        """
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': source,
+            'reliability': self.cfg['reliability'],
+            'conditions': conditions
+        })

    def parse_searchrequest(self, response):
-        """Parse the initial response of the ChemSpider Search API """
+        """
+        This function parses the initial response of the ChemSpider Search API
+        Requires a valid token to function.
+        :param response: the Response object to be parsed
+        :return: A Request for the information page and a Request for the
+        extendedinfo API call
+        """
        sel = Selector(response)
        log.msg('chemspider parse_searchrequest', level=log.DEBUG)
        sel.register_namespace('cs', 'http://www.chemspider.com/')
@@ -219,8 +277,8 @@ class ChemSpider(Source):
            log.msg('ChemSpider found multiple substances, taking first '
                    'element', level=log.DEBUG)
        csid = csids[0]
-        structure_url = self.website[:-1] + self.structure % csid
-        extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
+        structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
+        extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
        log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
        return [Request(url=structure_url,
                        callback=self.parse),
@@ -228,8 +286,13 @@ class ChemSpider(Source):
                callback=self.parse_extendedinfo)]

    def new_compound_request(self, compound):
+        """
+        This function is called when a new synonym is returned to the spider
+        to generate new requests
+        :param compound: the name of the compound to search for
+        """
        if compound in self.ignore_list or self.cfg['token'] == '':
            return None
-        searchurl = self.website[:-1] + self.search % compound
+        searchurl = self.website[:-2].replace("\\", "") + self.search % compound
        log.msg('chemspider compound', level=log.DEBUG)
        return Request(url=searchurl, callback=self.parse_searchrequest)
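Because `website` is now a regex, building real request URLs from it takes the recurring `self.website[:-2].replace("\\", "")` idiom seen throughout this release: strip the trailing `.*`, then drop the backslash escapes. A minimal illustration with the values from this file:

```
website = 'http://www\\.chemspider\\.com/.*'
structure = 'Chemical-Structure.%s.html'

base = website[:-2].replace("\\", "")   # -> 'http://www.chemspider.com/'
print base + structure % 2157
# -> http://www.chemspider.com/Chemical-Structure.2157.html
```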
FourmiCrawler/sources/NIST.py

@@ -13,20 +13,31 @@ from FourmiCrawler.items import Result
# Result item, but should be included eventually.

class NIST(Source):
-    """NIST Scraper plugin
-
-    """
+    """
+    NIST Scraper plugin
+    This plugin manages searching for a chemical on the NIST website
+    and parsing the resulting page if the chemical exists on NIST.
+    """
-    website = "http://webbook.nist.gov/*"
+    website = "http://webbook\\.nist\\.gov/.*"

    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

    def __init__(self, config=None):
+        """
+        Initialization of NIST scraper
+        :param config: configuration variables for this scraper, must contain
+        'reliability' key.
+        """
        Source.__init__(self, config)
        self.ignore_list = set()

    def parse(self, response):
+        """
+        This function is called when a Response matching the variable
+        'website' is available for parsing the Response object.
+        :param response: The Scrapy Response object to be parsed
+        :return: a list of Result items and Request objects
+        """
        sel = Selector(response)

        title = sel.xpath('head/title/text()').extract()[0]
@@ -51,6 +62,21 @@ class NIST(Source):
            log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                    level=log.DEBUG)

+        requests.extend(self.parse_tables(sel, symbol_table))
+
+        return requests
+
+    def parse_tables(self, sel, symbol_table):
+        """
+        This function identifies and distributes parsing of tables to other
+        functions below.
+        :param sel: A Selector object of the whole page
+        :param symbol_table: a dictionary containing translations of raw HTML
+        tags to human readable names
+        :return: a list of Result items and Requests
+        """
+        requests = []
+
        for table in sel.xpath('//table[@class="data"]'):
            summary = table.xpath('@summary').extract()[0]
            if summary == 'One dimensional data':
@@ -81,8 +107,12 @@ class NIST(Source):
        return requests

    def parse_generic_info(self, sel):
-        """Parses: synonyms, chemical formula, molecular weight, InChI,
-        InChiKey, CAS number
-        """
+        """
+        This function parses: synonyms, chemical formula, molecular weight,
+        InChI, InChiKey, CAS number
+        :param sel: A Selector object of the entire page in the original
+        response
+        :return: a list of Result items
+        """
        ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
@@ -121,15 +151,20 @@ class NIST(Source):
        return requests

    def parse_aggregate_data(self, table, symbol_table):
-        """Parses the table(s) which contain possible links to individual
-        data points
-        """
+        """
+        This function parses the table(s) which contain possible links to
+        individual data points
+        :param table: a Selector object of the table to be parsed
+        :param symbol_table: a dictionary containing translations of raw HTML
+        tags to human readable names
+        :return: a list of Result items and Request objects
+        """
        results = []
        for tr in table.xpath('tr[td]'):
            extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                      '/a/@href').extract()
            if extra_data_url:
-                request = Request(url=self.website[:-1] + extra_data_url[0],
+                request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
                                  callback=self.parse_individual_datapoints)
                results.append(request)
                continue
@@ -155,14 +190,16 @@ class NIST(Source):
        return results

    def parse_transition_data(self, table, summary):
-        """Parses the table containing properties regarding phase changes"""
+        """
+        This function parses the table containing properties regarding phase
+        changes
+        :param table: a Selector object of the table to be parsed
+        :param summary: the name of the property
+        :return: a list of Result items
+        """
        results = []

-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
+        unit = self.get_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
@@ -176,18 +213,18 @@ class NIST(Source):
        return results

    def parse_generic_data(self, table, summary):
-        """Parses the common tables of 4 and 5 rows. Assumes they are of the
+        """
+        Parses the common tables of 4 and 5 rows. Assumes they are of the
        form:
        Symbol (unit)|Temperature (K)|Method|Reference|Comment
        Symbol (unit)|Temperature (K)|Reference|Comment
+        :param table: a Selector object of the table to be parsed
+        :param summary: the name of the property
+        :return: a list of Result items
        """
        results = []

-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
+        unit = self.get_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
@@ -200,7 +237,13 @@ class NIST(Source):
        return results

    def parse_antoine_data(self, table, summary):
-        """Parse table containing parameters for the Antione equation"""
+        """
+        This function parses the table containing parameters for the Antoine
+        equation
+        :param table: a Selector object of the table to be parsed
+        :param summary: the name of the property
+        :return: a list of Result items
+        """
        results = []

        for tr in table.xpath('tr[td]'):
@@ -215,7 +258,12 @@ class NIST(Source):
        return results

    def parse_individual_datapoints(self, response):
-        """Parses the page linked from aggregate data"""
+        """
+        This function parses the 'individual data points' page linked from
+        the aggregate data table(s)
+        :param response: the Scrapy Response object to be parsed
+        :return: a list of Result items
+        """
        sel = Selector(response)
        table = sel.xpath('//table[@class="data"]')[0]
@@ -228,11 +276,7 @@ class NIST(Source):
            name = m.group(1)
            condition = m.group(2)

-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
+        unit = self.get_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
@@ -250,7 +294,25 @@ class NIST(Source):

        return results

+    @staticmethod
+    def get_unit(table):
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
+
+        return unit
+
    def newresult(self, attribute, value, conditions=''):
+        """
+        This function abstracts from the Result item and provides default
+        values
+        :param attribute: the name of the attribute
+        :param value: the value of the attribute
+        :param conditions: optional conditions regarding the value
+        :return: A Result item
+        """
        return Result(
            {
                'attribute': attribute,
@@ -261,7 +323,12 @@ class NIST(Source):
            })

    def new_compound_request(self, compound):
+        """
+        This function is called when a new synonym is returned to the spider
+        to generate new requests
+        :param compound: the name of the compound to search for
+        """
        if compound not in self.ignore_list:
            self.ignore_list.update(compound)
-            return Request(url=self.website[:-1] + self.search % compound,
+            return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
                           callback=self.parse)
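The three identical inline unit-extraction blocks removed above collapse into the new `get_unit` helper, which pulls the parenthesised unit out of a table header cell. What it computes, shown on a bare header string (a hypothetical example; the real method reads `tr[1]/th[1]` of the table Selector):

```
import re

tr_unit = 'Tboil (K)'
m = re.search(r'\((.*)\)', tr_unit)
unit = m.group(1) if m else '!'   # '!' is the sentinel for "no unit found"
print unit  # -> K
```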
FourmiCrawler/sources/PubChem.py

@@ -1,9 +1,11 @@
+import re
+
from scrapy.http import Request
from scrapy import log
-from source import Source
from scrapy.selector import Selector

+from source import Source
from FourmiCrawler.items import Result
-import re


class PubChem(Source):
@@ -13,10 +15,10 @@ class PubChem(Source):
    including sources of the values of properties.
    """

-    #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
-    website = 'https://*.ncbi.nlm.nih.gov/*'
-    website_www = 'https://www.ncbi.nlm.nih.gov/*'
-    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
+    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
+    website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
+    website_www = 'http://www.ncbi.nlm.nih.gov/*'
+    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
    search = 'pccompound?term=%s'
    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
@@ -49,14 +51,15 @@ class PubChem(Source):
            self._spider.get_synonym_requests(synonym)
        log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)

-        n = re.search(r'cid=(\d+)',response.url)
+        n = re.search(r'cid=(\d+)', response.url)
        if n:
            cid = n.group(1)
-            log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach
-            # the seperate html page which contains the properties and their values
+            log.msg('cid: %s' % cid, level=log.DEBUG)  # getting the right id of the compound with which it can reach
+            # the separate html page which contains the properties and their values

-            #using this cid to get the right url and scrape it
-            requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
+            # using this cid to get the right url and scrape it
+            requests.append(
+                Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
        return requests

    def parse_data(self, response):
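The compound id (cid) is lifted straight out of the URL PubChem lands on; a quick sketch with a hypothetical redirect URL:

```
import re

# Hypothetical PubChem URL after a search redirect.
url = 'http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297'

n = re.search(r'cid=(\d+)', url)
if n:
    print n.group(1)  # -> '297', used to fill in the summary_toc data_url
```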
@@ -72,22 +75,22 @@ class PubChem(Source):
        props = sel.xpath('//div')

        for prop in props:
-            prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing
-            if prop.xpath('a'): # parsing for single value in property
+            prop_name = ''.join(prop.xpath('b/text()').extract())  # name of property that it is parsing
+            if prop.xpath('a'):  # parsing for single value in property
                prop_source = ''.join(prop.xpath('a/@title').extract())
                prop_value = ''.join(prop.xpath('a/text()').extract())
                new_prop = Result({
                    'attribute': prop_name,
                    'value': prop_value,
                    'source': prop_source,
-                    'reliability': 'Unknown',
+                    'reliability': self.cfg['reliability'],
                    'conditions': ''
                })
                log.msg('PubChem prop: |%s| |%s| |%s|' %
                        (new_prop['attribute'], new_prop['value'],
                         new_prop['source']), level=log.DEBUG)
                requests.append(new_prop)
-            elif prop.xpath('ul'): # parsing for multiple values (list) in property
+            elif prop.xpath('ul'):  # parsing for multiple values (list) in property
                prop_values = prop.xpath('ul//li')
                for prop_li in prop_values:
                    prop_value = ''.join(prop_li.xpath('a/text()').extract())
@@ -96,16 +99,51 @@ class PubChem(Source):
                        'attribute': prop_name,
                        'value': prop_value,
                        'source': prop_source,
-                        'reliability': 'Unknown',
+                        'reliability': self.cfg['reliability'],
                        'conditions': ''
                    })
                    log.msg('PubChem prop: |%s| |%s| |%s|' %
-                        (new_prop['attribute'], new_prop['value'],
-                         new_prop['source']), level=log.DEBUG)
+                            (new_prop['attribute'], new_prop['value'],
+                             new_prop['source']), level=log.DEBUG)
                    requests.append(new_prop)

        return requests

+    def parse_searchrequest(self, response):
+        """
+        This function parses the response to the new_compound_request Request
+        :param response: the Response object to be parsed
+        :return: A Request for the compound page or what self.parse returns in
+        case the search request forwarded to the compound page
+        """
+
+        # check if pubchem forwarded straight to compound page
+        m = re.match(self.website_pubchem, response.url)
+        if m:
+            log.msg('PubChem search forwarded to compound page',
+                    level=log.DEBUG)
+            return self.parse(response)
+
+        sel = Selector(response)
+
+        results = sel.xpath('//div[@class="rsltcont"]')
+        if results:
+            url = results[0].xpath('div/p/a[1]/@href')
+        else:
+            log.msg('PubChem search found nothing or xpath failed',
+                    level=log.DEBUG)
+            return None
+
+        if url:
+            url = 'http:' + ''.join(url[0].extract())
+            log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
+        else:
+            log.msg('PubChem search found results, but no url in first result',
+                    level=log.DEBUG)
+            return None
+
+        return Request(url=url, callback=self.parse)
+
    def new_compound_request(self, compound):
-        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
+        return Request(url=self.website_www[:-1] + self.search % compound,
+                       callback=self.parse_searchrequest)
FourmiCrawler/sources/WikipediaParser.py

@@ -15,7 +15,7 @@ class WikipediaParser(Source):
    It also returns requests with other external sources which contain information on parsed subject.
    """

-    website = "http://en.wikipedia.org/wiki/*"
+    website = "http://en\\.wikipedia\\.org/wiki/.*"
    __spider = None
    searched_compounds = []

@@ -123,7 +123,7 @@ class WikipediaParser(Source):
        return items

    def new_compound_request(self, compound):
-        return Request(url=self.website[:-1] + compound, callback=self.parse)
+        return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)

    @staticmethod
    def clean_items(items):
FourmiCrawler/sources/source.py

@@ -3,7 +3,7 @@ from scrapy import log


class Source:
-    website = "http://something/*"  # Regex of URI's the source is able to parse
+    website = "http://something/.*"  # Regex of URI's the source is able to parse
    _spider = None

    def __init__(self, config=None):

@@ -30,7 +30,7 @@ class Source:
        :param compound: A compound name.
        :return: A new Scrapy Request
        """
-        # return Request(url=self.website[:-1] + compound, callback=self.parse)
+        # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
        pass

    def set_spider(self, spider):
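For anyone writing a new plugin against this base class, a minimal subclass sketch following the pattern the bundled sources use; the class name, domain, and search path here are made up, and imports assume the file lives next to source.py in FourmiCrawler/sources:

```
from scrapy.http import Request

from source import Source


class ExampleSource(Source):
    """Hypothetical source plugin, illustrating the bundled sources' pattern."""
    website = 'http://data\\.example\\.org/.*'   # regex the spider dispatches on
    search = 'search?compound=%s'

    def __init__(self, config=None):
        Source.__init__(self, config)

    def parse(self, response):
        return []  # would return Result items and follow-up Requests

    def new_compound_request(self, compound):
        # same regex-to-URL idiom as ChemSpider and NIST above
        return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
                       callback=self.parse)
```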
FourmiCrawler/spider.py

@@ -34,8 +34,9 @@ class FourmiSpider(Spider):
        """
        for source in self._sources:
            if re.match(source.website, response.url):
-                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                return source.parse(response)
+        log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
        return None

    def get_synonym_requests(self, compound, force=False):
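This dispatch loop is why the `website` attributes were converted into real regexes in this release: each response is routed to the first source whose pattern matches. Schematically, with the patterns committed above:

```
import re

# Patterns as committed in this release.
patterns = ['http://www\\.chemspider\\.com/.*',
            'http://webbook\\.nist\\.gov/.*',
            'http://en\\.wikipedia\\.org/wiki/.*']

url = 'http://webbook.nist.gov/cgi/cbook.cgi?Name=methane'
for website in patterns:
    if re.match(website, url):
        print 'matched:', website  # the spider would call source.parse(response)
        break
```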
GUI.cfg.sample (new file, 10 lines)

@@ -0,0 +1,10 @@
[GUI]
# Personalize options in your User Interface

# Commonly used parameters are listed in the GUI for easy selection
CommonParameters = Weight, Polarity, Viscosity, Solubility, Name

# Parameters that are always used in the search
AlwaysParameters = Name

OutputTypes = csv, json, jsonlines, xml
GUI/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
import gui
GUI/configImporter.py (new file, 30 lines)

@@ -0,0 +1,30 @@
import ConfigParser


class ConfigImporter():
    def __init__(self, filename):
        """Read the filename into the parser."""
        self.filename = filename
        self.parser = ConfigParser.ConfigParser()
        self.parser.read(self.filename)

    def load_common_attributes(self):
        """Loads common attributes from the initialized file."""
        try:
            return self.parser.get('GUI', 'CommonParameters')
        except:
            return 'One, Two, Three'

    def load_output_types(self):
        """Loads output types from the initialized file."""
        try:
            return self.parser.get('GUI', 'OutputTypes')
        except:
            return 'csv'

    def load_always_attributes(self):
        """Loads attributes that are always searched for from the initialized file."""
        try:
            return self.parser.get('GUI', 'AlwaysParameters')
        except:
            return 'Name, Weight'
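Usage is a two-liner; with the sample config above copied into place (paths assumed), the loaders return the comma-separated strings that the GUI then splits and strips:

```
from GUI.configImporter import ConfigImporter

importer = ConfigImporter('GUI.cfg')      # falls back to defaults if unreadable
print importer.load_common_attributes()  # -> 'Weight, Polarity, Viscosity, Solubility, Name'
print importer.load_output_types()       # -> 'csv, json, jsonlines, xml'
```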
GUI/gui.py (new file, 196 lines)

@@ -0,0 +1,196 @@
import os
import shutil
import sys
from Tkinter import *
from tkFileDialog import asksaveasfilename

from configImporter import *


class GUI():
    def __init__(self, search, config_file='GUI.cfg', sourceloader=None, in_source=True):
        """Boots the window, configuration."""
        if not in_source:
            current_dir = os.path.dirname(os.path.abspath(__file__))
            config_file = current_dir + '/../' + config_file
        if not os.path.isfile(config_file):
            try:
                shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../GUI.cfg.sample", config_file)
            except IOError:
                print "GUI configuration couldn't be found and couldn't be created."
                sys.exit()
        self.configurator = ConfigImporter(config_file)
        self.sourceloader = sourceloader
        self.finish_with_search = False
        self.values = {}
        self.required_variables = ['substance']
        self.search = search
        self.window, self.variables = self.generate_window(self.load_common_attributes(), self.load_output_types())

    def load_common_attributes(self):
        """Calls the configuration parser for common attributes."""
        return [x.strip() for x in self.configurator.load_common_attributes().split(',')]

    def load_output_types(self):
        """Calls the configuration parser for output types."""
        return [x.strip() for x in self.configurator.load_output_types().split(',')]

    def load_always_attributes(self):
        """Calls the configuration parser for attributes that are always used."""
        return ','.join([x.strip() for x in self.configurator.load_always_attributes().split(',')])

    def set_output(self):
        self.variable_output_name.set(asksaveasfilename())
        self.button_output_name.config(text=self.variable_output_name.get())

    def generate_window(self, common_attributes, output_types):
        """Creates all widgets and variables in the window."""
        window = Tk()
        window.wm_title("Fourmi Crawler")

        variables = {}

        variable_substance = StringVar(window)
        frame_substance = Frame(window)
        label_substance = Label(frame_substance, text="Substance: ")
        input_substance = Entry(frame_substance, font=("Helvetica", 12), width=25, textvariable=variable_substance)
        variables.update({"substance": variable_substance})
        frame_substance.pack(side=TOP)
        label_substance.pack()
        input_substance.pack()
        input_substance.focus()

        frame_all_attributes = Frame(window)
        frame_selecting_attributes = Frame(frame_all_attributes)
        frame_new_attributes = Frame(frame_selecting_attributes)
        label_new_attributes = Label(frame_new_attributes, text="Parameters: ")
        input_new_attributes = Text(frame_new_attributes, font=("Helvetica", 8), width=25, height=7, padx=5, pady=5)
        variables.update({"new_attributes": input_new_attributes})
        frame_new_attributes.pack(side=LEFT)
        label_new_attributes.pack()
        input_new_attributes.pack()

        frame_common_attributes = Frame(frame_selecting_attributes)
        label_common_attributes = Label(frame_common_attributes, text="Common Parameters: ")
        input_common_attributes = Listbox(frame_common_attributes, selectmode=MULTIPLE, height=7)
        scrollbar_common_attributes = Scrollbar(frame_common_attributes)
        input_common_attributes.config(yscrollcommand=scrollbar_common_attributes.set)
        scrollbar_common_attributes.config(command=input_common_attributes.yview)
        if common_attributes and len(common_attributes) > 0:
            input_common_attributes.insert(END, *common_attributes)
        variables.update({"common_attributes": input_common_attributes})
        frame_common_attributes.pack(side=RIGHT)
        label_common_attributes.pack(side=TOP)
        input_common_attributes.pack(side=LEFT)
        scrollbar_common_attributes.pack(side=RIGHT, fill=Y)
        frame_selecting_attributes.pack()

        frame_last = Frame(window)
        search_button = Button(frame_last, text="Start search", command=self.prepare_search)
        cancel_button = Button(frame_last, text="Cancel", command=window.destroy)
        frame_last.pack(side=BOTTOM)
        search_button.pack(side=LEFT)
        cancel_button.pack(side=RIGHT)

        frame_name = Frame(window)
        frame_output_name = Frame(frame_name)
        label_output_name = Label(frame_output_name, text='Output file:')
        self.variable_output_name = StringVar()
        self.variable_output_name.set('results.csv')
        variables.update({'output_name': self.variable_output_name})
        self.button_output_name = Button(frame_output_name, command=self.set_output, text="Select file")
        frame_output_name.pack(side=LEFT)
        label_output_name.pack()
        self.button_output_name.pack()
        frame_name.pack(side=BOTTOM)

        frame_checkboxes = Frame(window)
        frame_checkbox_attributes = Frame(frame_checkboxes)
        variable_all_attributes = BooleanVar()
        variable_all_attributes.set(True)
        input_all_attributes = Checkbutton(frame_checkbox_attributes, text="Search ALL parameters",
                                           variable=variable_all_attributes)
        variables.update({"all_attributes": variable_all_attributes})
        frame_checkbox_attributes.pack(side=LEFT)
        input_all_attributes.pack()

        frame_logging = Frame(frame_checkboxes)
        variable_logging = BooleanVar()
        variable_logging.set(False)
        input_logging = Checkbutton(frame_logging, text="Verbose logging", variable=variable_logging)
        variables.update({'logging': variable_logging})
        frame_logging.pack(side=RIGHT)
        frame_checkboxes.pack(side=BOTTOM)
        input_logging.pack()
        frame_all_attributes.pack()

        return window, variables

    def prepare_search(self):
        """Saves the values from the window for later retrieval."""
        variables = self.variables
        values = {}

        values.update({"Always attributes": self.load_always_attributes()})
        for name, var in variables.iteritems():
            if var.__class__ is StringVar:
                values.update({name: var.get()})
            elif var.__class__ is BooleanVar:
                values.update({name: var.get()})
            elif var.__class__ is Text:
                values.update({name: str(var.get("1.0", END)).strip()})
            elif var.__class__ is Listbox:
                values.update({name: ", ".join([var.get(int(i)) for i in var.curselection()])})
            else:
                print "No known class, {}, {}".format(name, var)

        values.update({'output_name': self.variable_output_name.get()})
        values.update({'output_type': self.check_output_type(values.get('output_name'))})

        self.values = values
        if all([values.get(i) != '' for i in self.required_variables]):
            self.finish_with_search = True
            self.window.destroy()
        else:
            self.finish_with_search = False
            # tkMessageBox.showinfo('Not all required information was entered!')

    def execute_search(self):
        """Calls the Fourmi crawler with the values from the GUI"""
        if self.values.get('all_attributes'):
            attributes = ".*"
        else:
            attribute_types = ['attributes', 'Common attributes', 'Always attributes']
            attributes = ','.join([str(self.values.get(attribute)) for attribute in attribute_types])
        output_file = "file://" + str(self.values.get('output_name'))  # Dealing with absolute paths

        arguments = {'--attributes': attributes,
                     '--exclude': None,
                     '--format': self.values.get('output_type'),
                     '--help': False,
                     '--include': None,
                     '--log': 'log.txt',
                     '--output': output_file,
                     '-v': 0 if self.values.get('logging') else 3,
                     '--version': False,
                     '<compound>': self.values.get('substance'),
                     'list': False,
                     'search': True}

        self.search(arguments, self.sourceloader)

    def run(self):
        """Starts the window and the search."""
        self.window.mainloop()
        if self.finish_with_search:
            self.execute_search()

    def check_output_type(self, filename):
        parts = str(filename).split('.')
        output_types = self.load_output_types()
        extension = parts[-1]

        for type in output_types:
            if extension == type:
                return extension
        return output_types[0]
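Construction and startup take two calls; this mirrors the else branch added to fourmi.py later in this diff, where `search` is fourmi.py's `search(docopt_arguments, source_loader)` function:

```
from utils.sourceloader import SourceLoader
from GUI import gui

gui_window = gui.GUI(search, sourceloader=SourceLoader())
gui_window.run()  # Tk mainloop; kicks off the crawl when the user hits "Start search"
```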
README.md

@@ -48,7 +48,6 @@ __Main goals:__
- Build a graphical user interface (GUI) as an alternative for the command line
interface (CLI). (Assignee: Harmen)
- Compiling the source into a Windows executable. (Assignee: Bas)
-- Create a module to gather data from PubChem. (Assignee: Nout)

__Side goals:__
SIGNED.md (97 changes)

@@ -3,19 +3,19 @@
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.11 (GNU/Linux)

-iQIcBAABAgAGBQJTn3GgAAoJEJrQ9RIUCT6/CI4P/RSAQrd6JugGZoQu/gNdW6eB
-MYCybqYGZiieVhUaGOnFNVlp68YpXH+sP/Uc6hXEX30UQEsDmhMeT5NA7ZMS+zJ9
-MNHGQdJq22lGb3+VoVBV4RTMdkQXOXvx6p5biskjIEtM3tfTxP529GvAX2TFUNnt
-gGWk28EDr30M95XwDxwWo+57Xv8VtSb3VSvXEbrdwGYf8EoQo9oPtzYQ0YcdupcC
-ET8bukYVcwpAjoTnPlEy89TiHHohwmimr2ASXeQ64Ks5wfjzcF7NENCAmaAfR+KI
-VLLuGqdWMBx1ewVuAXTCZ0Mga/kBoRUaO0PC13UmL8LhhZY9Z3cwD4UnPU35/RQi
-IbLfQcZHf/gEvyMeiTYCsyWpm+/xxn1+EfHol4/Q9VSXzZgRBX05Ik6tqeCvjdgG
-4PyHBaJTTm/HfMNdg3mr1mbyjTv5UxglEyPv+Y4NdfoVfepkXsXbzvNSyVffZ3Bw
-UaFp7KzIC4Jugdpv63FleiAdDY0+iZ5shH86wD1+HJ0/a87kn5Ao1yESby7J7U+f
-poZQYeMFeuC0T5hY/3iYoyvZ68oH918ESESiucSulp5BvfwuqGL2+xo5uJIwGYXE
-3IDQC7xbA14JHX86IVJlSHAD33iWyiC+5yjw4/bRRVl37KPsLdHiXH3YIRnF5I2I
-ZbM/uDYyJdZbBe4UoCoF
-=AMhi
+iQIcBAABAgAGBQJTpMZAAAoJEJrQ9RIUCT6/Hf8P/AyX9ZD5zj6rBi2CwDOTs5aa
+flVqw9syvdqTzVfXQaR4UrCSOuyuOeAkiqub0BMjxyCurqAwN/SCPf3uOJ/tGXmt
+ZPtYVHjevJ4mbojLhZiJ2av8LC9VOh3Zl+reR3L2cLuBD4rVSrfUMJtczbbtNlk+
++mczRcTpzNvHQW6mKqyUoKn8xqNnLC7C+p5ybNZ5EADUfoKIF1xyTN6je6fpYZ1U
+IHxiUzeOvfX9ohmbfnfkpkuSll1nUJWsTgUPKhthJuxEhwCQ1xMdWhxfcyZJaMT2
+Pxgo8C8S6lzAk4PxBRBoePjgWAeaFmbr317WXHvw6SSHPIdzToKZgDiDC5LWvKxb
+RRdLZ6w7tg0/FSUexekrUafGT8Je0oIoLUQlNaEQzrPNhDpma1uHFfZg0vb2m4Hq
+WHLLKTCr6FMczhP1TmuIEtdjKtymT+rO+Ls4ciw+654R7MtBYcmTr+RqmAd+GadJ
+vJNmGDod2oPwCydEps8bYAbksqRhMmk3xwco/g6dWYh5/+1GzCr80J7fYpqtoPFH
+V5qKyDQovF5jPlb/buq4mH8XYVT1z4Sx8azKVctMLig57zRnvN0WyskpT09oY7dK
+TPvIqwTixekndYLcM3QacVq/NhVOOQPFvD0PwU18eKs4EfD2L7iWd2XjV9Az++aD
+jUY6EwEuOzDCexWP4eM8
+=h6TK
-----END PGP SIGNATURE-----

```

@@ -27,38 +27,45 @@ ZbM/uDYyJdZbBe4UoCoF
#### Expect

```
-size exec file contents
-./
-375 .gitignore d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1
-464 .travis.yml 3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c
-428 Changelog.md c7791d1914ddca9ff1549d90468a79787a7feafe94cecd756e3d7cbd4bcbc7df
-FourmiCrawler/
-0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
-304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806
-2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49
-914 settings.py 0be2eaf8e83e85ed27754c896421180fc80cb5ce44449aa9f1048e465d1a96f2
-sources/
-9991 ChemSpider.py 847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d
-9898 NIST.py 97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644
-4754 PubChem.py 58ed4c92519e385f2768cf8034b006b18f8a21632cb1c5a0849b1a329a8c6ffb
-6907 WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97
-0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
-1262 source.py 16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4
-3026 spider.py 1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a
-1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c
-3965 README.md d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3
-3676 x fourmi.py 2ff89f97fd2a49d08417d9ab6cf08e88944d0c45f54ec84550b530be48676c23
-261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85
-tests/
-1 __init__.py 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
-2837 test_configurator.py 4a0eb6e7121eb09a63ab5cb797570d1a42080c5346c3b8b365da56eefa599e80
-1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031
-1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869
-2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299
-utils/
-0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
-3552 configurator.py e2b7e0ee6c1fef4373785dfe5df8ec6950f31ce6a5d9632b69a66ea3d1eaf921
-2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37
+size exec file contents
+./
+412 .gitignore 25059da2ee328837ece01b979cd5c1083ed1679372f06c14c1c58035d8120614
+548 .travis.yml 7f11bc58a8e94276ef949afeb107f9f1e184c0dbb84f821705ea2245902ed546
+846 Changelog.md 345f9aea4812b37b1b2714703ea0d5edd27414c0f839ec3e322450ad5ec5c6ed
+FourmiCrawler/
+0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
+304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806
+2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49
+677 settings.py f1e7d21b899ffc2523516c0ebe67d967dc62495b90c2fe34651042a3049fcd94
+sources/
+12103 ChemSpider.py f647d70acf9b3f1ee7bde75586aa45156331f977ca7fe836ceac4477a2c0d4ce
+12400 NIST.py cdb4c423355ac8fb1097197a9f8df44f667925a785c6bae7c583820da08908ee
+6121 PubChem.py 8f8ad40459090b818a384a202e739fe4696a04154df2b8419aee896b0fa02481
+6930 WikipediaParser.py ae9f57bbf2aad9c371abcd143fd2dda5995a196cb700734a5035dd94b1988870
+0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
+1281 source.py 7927fda259ff2c8096fa526db1f08586de6e04473a491e19a07b092fdeed81fc
+3111 spider.py ec7c946907fea10c17ee6dd88a506f3e3bf2cd748e3eb09200487fcec2ae7ba3
+GUI/
+11 __init__.py 40567015c415e853210425c1b4f3834dbc2a3165e3713e04dd3424b79bc90aa3
+940 configImporter.py 5d731d63a3117b25b7e556a746a1dd5b16e8cbb60e57be46de333c31c8c00271
+8776 gui.py 20b2220bc3ca55ebfd6d04e8c0bebbf1ae316c85a54db60b8fc02d22642f19d5
+299 GUI.cfg.sample 4ee27f7099d588c21358cd645a21621e631d80712f1b514dad898faa5fee2483
+1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c
+3900 README.md f4a1e3ea1700d2b415acfad661cb45f960fe8e8ffbe98dbecb6c7ed071a101ac
+3846 x fourmi.py f0b11f5f153f96f6af2e504cdf369e43c04316752de131a659eb6246fd80212a
+261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85
+416 sources.cfg.sample 11cd0fc18693da17883c98d25a384ae1b6158adfef13778b6dd02b878f6b8a70
+tests/
+107 __init__.py ce90e54e58a0912cadbe3adcf5166dc72477bf9ce289bf427f8e2f5b25406670
+2870 test_configurator.py 318d542b1cda5075a2a9a6be97e9e7a79372ee58e1ab3014c161534094f7364d
+1315 test_gui.py 0fb95d0b542765bf52bcebb037bf2ed1299209beab23448af741a93c9fbb1ca8
+1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031
+1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869
+2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299
+utils/
+40 __init__.py f1237ae74693e2ec1b3154e57aec27438a80a735e5ccf2411aecd194ef443b6a
+4047 configurator.py 8b566a0435a9f105a8ec616b16c3e21edb9b82f8debe1ef9f1df6bbbf20949d5
+2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37
```

#### Ignore
fourmi.py (15 changes)

@@ -1,8 +1,9 @@
#!/usr/bin/env python
"""
-Fourmi, a web scraper built to search specific information for a given compound (and it's pseudonyms).
+Fourmi, a web scraper built to search specific information for a given compound (and its pseudonyms).

Usage:
+    fourmi
    fourmi search <compound>
    fourmi [options] search <compound>
    fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound>

@@ -17,7 +18,7 @@ Options:
    --version                  Show version.
    -v                         Verbose logging output. (Multiple occurrences increase logging level)
    --log=<file>               Save log to a file.
-    -o <file> --output=<file>  Output file [default: results.*format*]
+    -o <file> --output=<file>  Output file [default: <compound>.*format*]
    -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: csv]
    --include=<regex>          Include only sources that match these regular expressions split by a comma.
    --exclude=<regex>          Exclude the sources that match these regular expressions split by a comma.

@@ -31,6 +32,7 @@ import docopt

from FourmiCrawler.spider import FourmiSpider
from utils.configurator import Configurator
from utils.sourceloader import SourceLoader
+from GUI import gui


def setup_crawler(compound, settings, source_loader, attributes):

@@ -58,18 +60,18 @@ def search(docopt_arguments, source_loader):
    """
    conf = Configurator()
    conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
-    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
+    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
                  source_loader, docopt_arguments["--attributes"].split(','))
    if conf.scrapy_settings.getbool("LOG_ENABLED"):
        log.start(conf.scrapy_settings.get("LOG_FILE"),
-                 conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
+                  conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
    reactor.run()


# The start for the Fourmi Command Line interface.
if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.3')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.6.0')
    loader = SourceLoader()

    if arguments["--include"]:

@@ -82,3 +84,6 @@ if __name__ == '__main__':
    elif arguments["list"]:
        print "-== Available Sources ==-"
        print str(loader)
+    else:
+        gui_window = gui.GUI(search, sourceloader=SourceLoader())
+        gui_window.run()
sources.cfg.sample (new file, 19 lines)

@@ -0,0 +1,19 @@
[DEFAULT]
reliability = Unknown

# For each source listed in FourmiCrawler/sources there should be a section
# named exactly as the filename in here. If not present, the DEFAULT value is
# used for reliability of that source.

[ChemSpider]
reliability = High
# token=Paste ChemSpider API token here and remove the hashtag

[NIST]
reliability = High

[WikipediaParser]
reliability = Medium

[PubChem]
reliability = High
tests/__init__.py

@@ -1 +1,6 @@
+import test_configurator
+import test_gui
+import test_pipeline
+import test_sourceloader
+import test_spider
tests/test_configurator.py

@@ -10,16 +10,16 @@ class TestConfigurator(unittest.TestCase):
        self.conf = Configurator()

    def test_set_output(self):
-        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.conf.set_output(filename="test.txt", fileformat="csv", compound="test")
        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")

-        self.conf.set_output("results.*format*", "jsonlines")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.conf.set_output("<compound>.*format*", "jsonlines", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json")
        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")

-        self.conf.set_output("results.*format*", "csv")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.conf.set_output("<compound>.*format*", "csv", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv")
        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")

    def test_start_log(self):
tests/test_gui.py (new file, 32 lines)

@@ -0,0 +1,32 @@
import unittest

from GUI import gui


class TestGUI(unittest.TestCase):
    def setUp(self):
        pass

    def test_empty_attributes(self):
        self.test_gui = gui.GUI(None, config_file="../GUI.cfg.sample", in_source=True)
        self.test_gui.window.after(9, self.test_gui.prepare_search)
        self.test_gui.window.after(11, self.test_gui.window.destroy)
        self.test_gui.run()

        output_type = self.test_gui.configurator.load_output_types().split(',')[0]

        self.assertEqual(self.test_gui.values.get('substance'), '')
        self.assertEqual(self.test_gui.values.get('output_type'), output_type)
        self.assertEqual(self.test_gui.values.get('output_name'), 'results.csv')

    def test_no_configurations(self):
        self.test_gui = gui.GUI(None, config_file="../GUI.cfg.sample")
        self.test_gui.configurator = gui.ConfigImporter('')
        self.test_gui.finish_with_search = True
        self.test_gui.window.after(9, self.test_gui.prepare_search)
        self.test_gui.window.after(11, self.test_gui.window.destroy)
        self.test_gui.run()

        self.assertEqual(self.test_gui.values.get('substance'), '')
        self.assertEqual(self.test_gui.values.get('output_type'), 'csv')
        self.assertEqual(self.test_gui.values.get('output_name'), 'results.csv')
utils/__init__.py

@@ -0,0 +1,2 @@
+import configurator
+import sourceloader
utils/configurator.py

@@ -1,4 +1,6 @@
import ConfigParser
+import os
+import shutil

from scrapy.utils.project import get_project_settings

@@ -12,7 +14,7 @@ class Configurator:
    def __init__(self):
        self.scrapy_settings = get_project_settings()

-    def set_output(self, filename, fileformat):
+    def set_output(self, filename, fileformat, compound):
        """
        This function manipulates the Scrapy output file settings that normally would be set in the settings file.
        In the Fourmi project these are command line arguments.

@@ -20,12 +22,12 @@ class Configurator:
        :param fileformat: The format in which the output will be.
        """

-        if filename != 'results.*format*':
+        if filename != '<compound>.*format*':
            self.scrapy_settings.overrides["FEED_URI"] = filename
        elif fileformat == "jsonlines":
-            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+            self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
        elif fileformat is not None:
-            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+            self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat

        if fileformat is not None:
            self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
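With the new signature, the `<compound>.*format*` placeholder resolves to a per-compound file name while an explicit `--output` still wins. Roughly, assuming this runs inside the project so `get_project_settings()` resolves:

```
from utils.configurator import Configurator

conf = Configurator()
conf.set_output('<compound>.*format*', 'csv', 'methane')
print conf.scrapy_settings['FEED_URI']   # -> methane.csv

conf.set_output('my_results.txt', 'csv', 'methane')  # explicit name wins
print conf.scrapy_settings['FEED_URI']   # -> my_results.txt
```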
@@ -66,8 +68,16 @@ class Configurator:
        variables for sources
        :return a ConfigParser object of sources.cfg
        """
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        config_path = current_dir + '/../sources.cfg'
+        # [TODO]: location of sources.cfg should be softcoded eventually
+        if not os.path.isfile(config_path):
+            try:
+                shutil.copyfile(os.path.dirname(os.path.abspath(__file__)) + "/../sources.cfg.sample", config_path)
+            except IOError:
+                print "WARNING: Source configuration couldn't be found and couldn't be created."
        config = ConfigParser.ConfigParser()
-        config.read('sources.cfg')  # [TODO]: should be softcoded eventually
+        config.read(config_path)
        return config

    @staticmethod