
Merge remote-tracking branch 'origin/develop' into feature/GUI

Harmen Prins 2014-05-30 14:58:31 +02:00
commit a99acfd008
8 changed files with 414 additions and 27 deletions

View File

@@ -5,6 +5,22 @@
import re
from scrapy.exceptions import DropItem
class RemoveNonePipeline(object):
def __init__(self):
self.known_values = set()
def process_item(self, item, spider):
"""
Processing the items so None values are replaced by empty strings
:param item: The incoming item
:param spider: The spider which scraped the spider
:return: :raise DropItem: Returns the item if unique or drops them if it's already known
"""
for key in item:
if item[key] is None:
item[key] = ""
return item
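# A quick illustration (a sketch, not part of the committed file): Result
# items behave like plain dicts here, and the spider argument is unused by
# this pipeline, so None is fine.
item = {'attribute': 'Melting point', 'value': None, 'source': 'ChemSpider'}
cleaned = RemoveNonePipeline().process_item(item, spider=None)
assert cleaned['value'] == ''  # None replaced by an empty string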
class DuplicatePipeline(object):

View File

@@ -11,8 +11,9 @@ BOT_NAME = 'FourmiCrawler'
SPIDER_MODULES = ['FourmiCrawler']
NEWSPIDER_MODULE = 'FourmiCrawler'
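# Scrapy runs item pipelines in ascending order of the values assigned below,
# so None values are stripped before attribute selection and de-duplication.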
ITEM_PIPELINES = {
'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100,
'FourmiCrawler.pipelines.DuplicatePipeline': 200,
"FourmiCrawler.pipelines.RemoveNonePipeline": 100,
'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200,
'FourmiCrawler.pipelines.DuplicatePipeline': 300,
}
FEED_URI = 'results.json'
FEED_FORMAT = 'jsonlines'

View File

@@ -47,7 +47,6 @@ class ChemSpider(Source):
properties = []
# Predicted - ACD/Labs tab
# [TODO] - test if tab contains data, some chemicals do not have data here
td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
'normalize-space(string())')
prop_names = td_list[::2]
@@ -58,6 +57,12 @@ class ChemSpider(Source):
prop_value = prop_value.extract().encode('utf-8')
prop_conditions = ''
# Test for properties without values, with one hardcoded exception
if (not re.match(r'^\d', prop_value) or
(prop_name == 'Polarizability' and
prop_value == '10-24cm3')):
continue
# Match for condition in parentheses
m = re.match(r'(.*) \((.*)\)', prop_name)
if m:
@@ -192,7 +197,8 @@ class ChemSpider(Source):
'reliability': 'Unknown',
'conditions': ''
})
properties.append(result)
if result['value']:
properties.append(result)
return properties
def parse_searchrequest(self, response):
@@ -200,8 +206,14 @@ class ChemSpider(Source):
sel = Selector(response)
log.msg('chemspider parse_searchrequest', level=log.DEBUG)
sel.register_namespace('cs', 'http://www.chemspider.com/')
csid = sel.xpath('.//cs:int/text()').extract()[0]
# [TODO] - handle multiple csids in case of vague search term
csids = sel.xpath('.//cs:int/text()').extract()
if len(csids) == 0:
log.msg('ChemSpider found nothing', level=log.ERROR)
return
elif len(csids) > 1:
log.msg('ChemSpider found multiple substances, taking first '
'element', level=log.DEBUG)
csid = csids[0]
structure_url = self.website[:-1] + self.structure % csid
extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
@@ -215,4 +227,4 @@ class ChemSpider(Source):
return None
searchurl = self.website[:-1] + self.search % compound
log.msg('chemspider compound', level=log.DEBUG)
return Request(url=searchurl, callback=self.parse_searchrequest)
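# The value filter added in parse() above, restated in isolation (a
# clarifying sketch, not part of the file):
import re

def keep_property(prop_name, prop_value):
    # Drop values that do not start with a digit, plus the hardcoded
    # '10-24cm3' polarizability artifact noted in the comment above.
    if not re.match(r'^\d', prop_value):
        return False
    if prop_name == 'Polarizability' and prop_value == '10-24cm3':
        return False
    return True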

View File

@@ -0,0 +1,273 @@
from source import Source
from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector
from FourmiCrawler.items import Result
import re
# [TODO]: values can be '128.', perhaps remove the dot in that case?
# [TODO]: properties have references and comments which do not exist in the
# Result item, but should be included eventually.
class NIST(Source):
"""NIST Scraper plugin
This plugin manages searching for a chemical on the NIST website
and parsing the resulting page if the chemical exists on NIST.
"""
website = "http://webbook.nist.gov/*"
search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
ignore_list = set()
def __init__(self):
Source.__init__(self)
def parse(self, response):
sel = Selector(response)
title = sel.xpath('head/title/text()').extract()[0]
if title == 'Name Not Found':
log.msg('NIST: Chemical not found!', level=log.ERROR)
return
if title not in self.ignore_list:
self.ignore_list.add(title)
log.msg('NIST emit synonym: %s' % title, level=log.DEBUG)
self._spider.get_synonym_requests(title)
requests = []
requests.extend(self.parse_generic_info(sel))
symbol_table = {}
tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
symbol = ''.join(symbol_td.xpath('node()').extract())
name = name_td.xpath('text()').extract()[0]
symbol_table[symbol] = name
log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
level=log.DEBUG)
for table in sel.xpath('//table[@class="data"]'):
summary = table.xpath('@summary').extract()[0]
if summary == 'One dimensional data':
log.msg('NIST table: Aggregate data', level=log.DEBUG)
requests.extend(
self.parse_aggregate_data(table, symbol_table))
elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
log.msg('NIST table: Enthalpy/entropy of phase transition',
level=log.DEBUG)
requests.extend(self.parse_transition_data(table, summary))
elif table.xpath('tr[1]/td'):
log.msg('NIST table: Horizontal table', level=log.DEBUG)
elif summary == 'Antoine Equation Parameters':
log.msg('NIST table: Antoine Equation Parameters',
level=log.DEBUG)
requests.extend(self.parse_antoine_data(table, summary))
elif len(table.xpath('tr[1]/th')) == 5:
log.msg('NIST table: generic 5 columns', level=log.DEBUG)
# Symbol (unit) Temperature (K) Method Reference Comment
requests.extend(self.parse_generic_data(table, summary))
elif len(table.xpath('tr[1]/th')) == 4:
log.msg('NIST table: generic 4 columns', level=log.DEBUG)
# Symbol (unit) Temperature (K) Reference Comment
requests.extend(self.parse_generic_data(table, summary))
else:
log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
continue  # Assume unsupported
return requests
def parse_generic_info(self, sel):
"""Parses: synonyms, chemical formula, molecular weight, InChI,
InChiKey, CAS number
"""
ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
li = ul.xpath('li')
raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
for synonym in raw_synonyms[0].strip().split(';\n'):
log.msg('NIST synonym: %s' % synonym, level=log.DEBUG)
self.ignore_list.add(synonym)
self._spider.get_synonym_requests(synonym)
data = {}
raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()
data['Chemical formula'] = ''.join(raw_formula[2:]).strip()
raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()')
data['Molecular weight'] = raw_mol_weight.extract()[0].strip()
raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()')
data['IUPAC Standard InChI'] = raw_inchi.extract()[0]
raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
'/tt/text()')
data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]
raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
data['CAS Registry Number'] = raw_cas_number.extract()[0].strip()
requests = []
for key, value in data.iteritems():
result = Result({
'attribute': key,
'value': value,
'source': 'NIST',
'reliability': 'Unknown',
'conditions': ''
})
requests.append(result)
return requests
def parse_aggregate_data(self, table, symbol_table):
"""Parses the table(s) which contain possible links to individual
data points
"""
results = []
for tr in table.xpath('tr[td]'):
extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
'/a/@href').extract()
if extra_data_url:
request = Request(url=self.website[:-1] + extra_data_url[0],
callback=self.parse_individual_datapoints)
results.append(request)
continue
data = []
for td in tr.xpath('td'):
data.append(''.join(td.xpath('node()').extract()))
name = symbol_table[data[0]]
condition = ''
m = re.match(r'(.*) at (.*)', name)
if m:
name = m.group(1)
condition = m.group(2)
result = Result({
'attribute': name,
'value': data[1] + ' ' + data[2],
'source': 'NIST',
'reliability': 'Unknown',
'conditions': condition
})
log.msg('NIST: |%s|' % data, level=log.DEBUG)
results.append(result)
return results
@staticmethod
def parse_transition_data(table, summary):
"""Parses the table containing properties regarding phase changes"""
results = []
tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
m = re.search(r'\((.*)\)', tr_unit)
unit = '!'
if m:
unit = m.group(1)
for tr in table.xpath('tr[td]'):
tds = tr.xpath('td/text()').extract()
result = Result({
'attribute': summary,
'value': tds[0] + ' ' + unit,
'source': 'NIST',
'reliability': 'Unknown',
'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
})
results.append(result)
return results
@staticmethod
def parse_generic_data(table, summary):
"""Parses the common tables of 4 and 5 rows. Assumes they are of the
form:
Symbol (unit)|Temperature (K)|Method|Reference|Comment
Symbol (unit)|Temperature (K)|Reference|Comment
"""
results = []
tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
m = re.search(r'\((.*)\)', tr_unit)
unit = '!'
if m:
unit = m.group(1)
for tr in table.xpath('tr[td]'):
tds = tr.xpath('td/text()').extract()
result = Result({
'attribute': summary,
'value': tds[0] + ' ' + unit,
'source': 'NIST',
'reliability': 'Unknown',
'conditions': '%s K' % tds[1]
})
results.append(result)
return results
@staticmethod
def parse_antoine_data(table, summary):
"""Parse table containing parameters for the Antione equation"""
results = []
for tr in table.xpath('tr[td]'):
tds = tr.xpath('td/text()').extract()
result = Result({
'attribute': summary,
'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
'source': 'NIST',
'reliability': 'Unknown',
'conditions': '%s K' % tds[0]
})
results.append(result)
return results
def parse_individual_datapoints(self, response):
"""Parses the page linked from aggregate data"""
sel = Selector(response)
table = sel.xpath('//table[@class="data"]')[0]
results = []
name = table.xpath('@summary').extract()[0]
condition = ''
m = re.match(r'(.*) at (.*)', name)
if m:
name = m.group(1)
condition = m.group(2)
tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
m = re.search(r'\((.*)\)', tr_unit)
unit = '!'
if m:
unit = m.group(1)
for tr in table.xpath('tr[td]'):
tds = tr.xpath('td/text()').extract()
uncertainty = ''
m = re.search('Uncertainty assigned by TRC = (.*?) ', tds[-1])
if m:
uncertainty = '+- %s ' % m.group(1)
# [TODO]: get the plusminus sign working in here
result = Result({
'attribute': name,
'value': '%s %s%s' % (tds[0], uncertainty, unit),
'source': 'NIST',
'reliability': 'Unknown',
'conditions': condition
})
results.append(result)
return results
def new_compound_request(self, compound):
if compound not in self.ignore_list:
self.ignore_list.add(compound)
return Request(url=self.website[:-1] + self.search % compound,
callback=self.parse)
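# The unit-extraction idiom shared by parse_transition_data,
# parse_generic_data and parse_individual_datapoints, in isolation (a
# sketch; the header string is a made-up example):
import re

header = 'Tboil (K)'
m = re.search(r'\((.*)\)', header)
unit = m.group(1) if m else '!'  # '!' marks a header without a unit
# unit == 'K'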

View File

@@ -36,8 +36,8 @@ class WikipediaParser(Source):
""" scrape data from infobox on wikipedia. """
items = []
#be sure to get both chembox (wikipedia template) and drugbox (wikipedia template) to scrape
tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').\
#be sure to get chembox (wikipedia template)
tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
xpath('normalize-space(string())')
prop_names = tr_list[::2]
prop_values = tr_list[1::2]
@@ -46,11 +46,31 @@ class WikipediaParser(Source):
'attribute': prop_name.extract().encode('utf-8'),
'value': prop_values[i].extract().encode('utf-8'),
'source': "Wikipedia",
'reliability': "",
'reliability': "Unknown",
'conditions': ""
})
items.append(item)
log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
#scrape the drugbox (wikipedia template)
tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
log.msg('drugbox rows: %s' % tr_list2, level=log.DEBUG)
for tablerow in tr_list2:
log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
'normalize-space(string())'):
item = Result({
'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
'source': "Wikipedia",
'reliability': "Unknown",
'conditions': ""
})
items.append(item)
log.msg(
'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
level=log.DEBUG)
items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
item_list = self.clean_items(items)
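# A minimal sketch of the drugbox row extraction above, run against an
# inline snippet (the table content is made up; Selector(text=...) is
# assumed available in the Scrapy version in use):
from scrapy.selector import Selector

html = ('<table class="infobox"><tr>'
        '<th>Molar mass</th><td>18.015 g/mol</td></tr></table>')
row = Selector(text=html).xpath('.//tr')[0]
name = row.xpath('./th').xpath('normalize-space(string())').extract()[0]
value = row.xpath('./td').xpath('normalize-space(string())').extract()[0]
# name == 'Molar mass', value == '18.015 g/mol'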

README.md Normal file
View File

@@ -0,0 +1,81 @@
# Fourmi
Fourmi is a web scraper for chemical substances. The program is designed to be
used as a search engine that queries multiple chemical databases for a specific
substance. The program produces all available attributes of the substance and
the conditions associated with those attributes. Fourmi also attempts to
estimate the reliability of each data point to assist the user in deciding
which data should be used.
The Fourmi project is an open source project licensed under the MIT license.
Feel free to contribute!
Fourmi is based on the [Scrapy framework](http://scrapy.org/), an open source
web scraping framework for Python. Most of the functionality of this project
can be traced back to this framework. Should the documentation for this
application fall short, we suggest you take a close look at the
[Scrapy architecture](http://doc.scrapy.org/en/latest/topics/architecture.html)
and the [Scrapy documentation](http://doc.scrapy.org/en/latest/index.html).
### Installing
If you're installing Fourmi, please take a look at our [installation guide](...)
on our wiki. When you've installed the application, make sure to check our
[usage guide](...).
### Using the Source
To use the Fourmi source code, multiple dependencies are required. Take a look
at the [wiki page](...) on using the application source code for a step-by-step
installation guide.
When developing for the Fourmi project, keep in mind that code readability is a
must. To maintain readability, code should conform to the
[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
code. More information about the different structures and principles of the
Fourmi application can be found on our [wiki](...).
### To Do
The Fourmi project has the following goals for the near future:
__Main goals:__
- Improve our documentation and guides. (Assignee: Dekker)
- Build a graphical user interface (GUI) as an alternative to the command line
interface (CLI). (Assignee: Harmen)
- Compile the source into a Windows executable. (Assignee: Bas)
- Create a configuration file to hold logins and API keys.
- Determine the reliability of our data points.
- Create a module to gather data from NIST. (Assignee: Rob)
- Create a module to gather data from PubChem. (Assignee: Nout)
__Side goals:__
- Clean and unify data.
- Extensive reliability analysis using statistical tests.
- Test data with Descartes 1.
### Project Origin
The Fourmi project was started in February 2014 as part of a software
engineering course at the Radboud University for students studying Computer
Science, Information Science or Artificial Intelligence. Students participate
in a real software development project as part of
[Giphouse](http://www.giphouse.nl/).
This particular project was started on behalf of Ivo B. Rietveld. As a chemist,
he was in need of an application to automatically search for information on
chemical substances and create a phase diagram. The so-called "Descartes"
project was split into two teams, each creating a different application that
covers part of the functionality. We are team Descartes 2, and as we were
responsible for creating a web crawler, we named our application Fourmi
(English: ant).
The following people were part of the original team:
- [Jip J. Dekker](http://jip.dekker.li)
- Rob ten Berge
- Harmen Prins
- Bas van Berkel
- Nout van Deijck
- Michail Kuznetcov

View File

@@ -1,16 +0,0 @@
We are the team Descartes 2.
----------------------------
Our team members are:
+ Rob ten Berge
+ Bas van Berkel
+ Nout van Deijck
+ Jip J. Dekker
+ Michail Kuznetcov
+ Harmen Prins

View File

@@ -80,7 +80,7 @@ def search(docopt_arguments, source_loader):
if __name__ == '__main__':
arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.6')
arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.0')
loader = SourceLoader()
if arguments["--include"]:
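# For context: a minimal standalone docopt sketch (hypothetical usage
# string; fourmi.py's real docstring is longer) showing how the version
# string above surfaces.
"""Usage: fourmi.py [--version] [--include=<regex>]"""
import docopt

arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.0')
# `python fourmi.py --version` prints 'Fourmi - V0.4.0' and exits.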