Spelling errors
This commit is contained in:
parent
c5bffffeda
commit
d523d4edcd
@ -7,7 +7,6 @@ import re
|
|||||||
|
|
||||||
|
|
||||||
class WikipediaParser(Source):
|
class WikipediaParser(Source):
|
||||||
|
|
||||||
""" Wikipedia scraper for chemical properties
|
""" Wikipedia scraper for chemical properties
|
||||||
|
|
||||||
This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
|
This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
|
||||||
@ -53,7 +52,7 @@ class WikipediaParser(Source):
|
|||||||
items.append(item)
|
items.append(item)
|
||||||
log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
|
log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
|
||||||
items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
|
items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
|
||||||
itemlist = self.cleanitems(items)
|
item_list = self.clean_items(items)
|
||||||
|
|
||||||
identifiers = self.get_identifiers(sel)
|
identifiers = self.get_identifiers(sel)
|
||||||
|
|
||||||
@ -62,7 +61,7 @@ class WikipediaParser(Source):
|
|||||||
request = None
|
request = None
|
||||||
#discard internal wikipedia links
|
#discard internal wikipedia links
|
||||||
if re.match('//en\.wikipedia', identifier):
|
if re.match('//en\.wikipedia', identifier):
|
||||||
log.msg('Found link to wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
|
log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
|
||||||
#fix links starting with '//www.'
|
#fix links starting with '//www.'
|
||||||
elif re.match('/{2}', identifier):
|
elif re.match('/{2}', identifier):
|
||||||
identifier = re.sub("/{2}", "http://", identifier)
|
identifier = re.sub("/{2}", "http://", identifier)
|
||||||
@ -70,15 +69,15 @@ class WikipediaParser(Source):
|
|||||||
else:
|
else:
|
||||||
request = Request(identifier)
|
request = Request(identifier)
|
||||||
log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
|
log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
|
||||||
itemlist.append(request)
|
item_list.append(request)
|
||||||
|
|
||||||
return itemlist
|
return item_list
|
||||||
|
|
||||||
def new_compound_request(self, compound):
|
def new_compound_request(self, compound):
|
||||||
return Request(url=self.website[:-1] + compound, callback=self.parse)
|
return Request(url=self.website[:-1] + compound, callback=self.parse)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def cleanitems(items):
|
def clean_items(items):
|
||||||
""" clean up properties using regex, makes it possible to split the values from the units """
|
""" clean up properties using regex, makes it possible to split the values from the units """
|
||||||
for item in items:
|
for item in items:
|
||||||
value = item['value']
|
value = item['value']
|
||||||
|
Reference in New Issue
Block a user