Archived
1
0

added comments

This commit is contained in:
Nout van Deijck 2014-04-23 16:17:23 +02:00
parent 9cefd336e0
commit 150fc5bea7

View File

@@ -8,9 +8,11 @@ import re
class WikipediaParser(Parser):
# General notes:
# Redirects seem to not matter as Wikipedia returns the page the redirect forwards to
# although this might lead to scraping both the original and the redirect with the same data.
""" Wikipedia scraper for chemical properties
This parser parses Wikipedia infoboxes (also bordered) to obtain properties and their values.
It also returns requests with other external sources which contain information on parsed subject.
"""
website = "http://en.wikipedia.org/wiki/*"
__spider = None
@@ -31,10 +33,12 @@ class WikipediaParser(Parser):
return items
def parse_infobox(self, sel):
#scrape data from infobox on wikipedia.
items = []
tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').xpath('normalize-space(string())')
#be sure to get both chembox (wikipedia template) and drugbox (wikipedia template) to scrape
tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').\
xpath('normalize-space(string())')
prop_names = tr_list[::2]
prop_values = tr_list[1::2]
for i, prop_name in enumerate(prop_names):
@@ -52,10 +56,14 @@ class WikipediaParser(Parser):
identifiers = self.get_identifiers(sel)
#add extra sources to scrape from as requests
for i, identifier in enumerate(identifiers):
if re.match('//en\.wikipedia',identifier):
request = None
#discard internal wikipedia links
if re.match('//en\.wikipedia', identifier):
log.msg('Found link to wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
elif re.match('/{2}',identifier):
#fix links starting with '//www.'
elif re.match('/{2}', identifier):
identifier = re.sub("/{2}", "http://", identifier)
request = Request(identifier)
else:
@@ -68,18 +76,22 @@ class WikipediaParser(Parser):
def new_compound_request(self, compound):
return Request(url=self.website[:-1] + compound, callback=self.parse)
def cleanitems(self, items):
@staticmethod
def cleanitems(items):
#clean up properties using regex, makes it possible to split the values from the units
for item in items:
value = item['value']
m = re.search('F;\s(\d+[\.,]?\d*)', value)
m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F)
if m:
item['value'] = m.group(1) + " K"
m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)
m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value) # clean up J/K/mol values
if m:
item['value'] = m.group(1) + " J/K/mol"
return items
def get_identifiers(self, sel):
@staticmethod
def get_identifiers(sel):
#find external links, named 'Identifiers' to different sources.
links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
'[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
return links