Spelling errors

Jip J. Dekker 2014-04-23 22:58:04 +02:00
parent c5bffffeda
commit d523d4edcd


@@ -7,7 +7,6 @@ import re
 class WikipediaParser(Source):
     """ Wikipedia scraper for chemical properties
     This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
@@ -53,7 +52,7 @@ class WikipediaParser(Source):
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
         items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
-        itemlist = self.cleanitems(items)
+        item_list = self.clean_items(items)
         identifiers = self.get_identifiers(sel)
@@ -62,7 +61,7 @@ class WikipediaParser(Source):
             request = None
             #discard internal wikipedia links
             if re.match('//en\.wikipedia', identifier):
-                log.msg('Found link to wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
+                log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
             #fix links starting with '//www.'
             elif re.match('/{2}', identifier):
                 identifier = re.sub("/{2}", "http://", identifier)
@@ -70,15 +69,15 @@ class WikipediaParser(Source):
             else:
                 request = Request(identifier)
                 log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
-            itemlist.append(request)
+            item_list.append(request)
-        return itemlist
+        return item_list
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)
     @staticmethod
-    def cleanitems(items):
+    def clean_items(items):
         """ clean up properties using regex, makes it possible to split the values from the units """
         for item in items:
             value = item['value']
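For reference, the identifier handling that appears unchanged in this diff (skipping internal Wikipedia links and rewriting protocol-relative '//' links) can be exercised on its own. A minimal sketch, assuming some made-up identifier strings; the list below is illustrative only and not taken from the spider:

import re

# Hypothetical identifiers, shaped like those pulled from a Wikipedia infobox.
identifiers = [
    '//en.wikipedia.org/wiki/Methane',      # internal Wikipedia link: not scraped
    '//www.chemspider.com/291',             # protocol-relative link: prefixed with http://
    'http://pubchem.ncbi.nlm.nih.gov/297',  # already absolute: left untouched
]

for identifier in identifiers:
    if re.match(r'//en\.wikipedia', identifier):
        # Internal Wikipedia links are skipped, matching the spider's warning branch.
        continue
    elif re.match(r'/{2}', identifier):
        # Rewrite the leading '//' to 'http://', mirroring the re.sub in the diff.
        identifier = re.sub(r'/{2}', 'http://', identifier)
    print(identifier)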