Archived
1
0

Cleaning up code

This commit is contained in:
Bas Vb 2014-04-23 15:24:57 +02:00
parent 3e1b33164e
commit 62475d965d

View File

@ -5,6 +5,7 @@ from scrapy.selector import Selector
from FourmiCrawler.items import Result from FourmiCrawler.items import Result
import re import re
class WikipediaParser(Parser): class WikipediaParser(Parser):
# General notes: # General notes:
@ -15,8 +16,8 @@ class WikipediaParser(Parser):
__spider = None __spider = None
searched_compounds = [] searched_compounds = []
#def __init__(self, csid): def __init__(self):
# self.website = "http://en.wikipedia.org/wiki/{id}".format(id=csid) pass
def parse(self, response): def parse(self, response):
print response.url print response.url
@ -31,27 +32,29 @@ class WikipediaParser(Parser):
return items return items
def parse_infobox(self, sel): def parse_infobox(self, sel):
items=[]
items = []
tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]').xpath('normalize-space(string())') tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]').xpath('normalize-space(string())')
prop_names = tr_list[::2] prop_names = tr_list[::2]
prop_values = tr_list[1::2] prop_values = tr_list[1::2]
for i, prop_name in enumerate(prop_names): for i, prop_name in enumerate(prop_names):
item = Result() item = Result({
item['attribute'] = prop_name.extract().encode('utf-8') 'attribute': prop_name.extract().encode('utf-8'),
item['value'] = prop_values[i].extract().encode('utf-8') 'value': prop_values[i].extract().encode('utf-8'),
item['source'] = "Wikipedia" 'source': "Wikipedia",
item['reliability'] = "" 'reliability': "",
item['conditions'] = "" 'conditions': ""
})
items.append(item) items.append(item)
log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
items=filter(lambda a: a['value']!='', items) #remove items with an empty value items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
itemlist=self.cleanitems(items) itemlist = self.cleanitems(items)
# request=Request(self.getchemspider(sel)) # request=Request(self.getchemspider(sel))
# itemlist.append(request) # itemlist.append(request)
identifiers=self.get_identifiers(sel) identifiers = self.get_identifiers(sel)
# print identifiers # print identifiers
for i, identifier in enumerate(identifiers): for i, identifier in enumerate(identifiers):
@ -70,23 +73,20 @@ class WikipediaParser(Parser):
def cleanitems(self, items): def cleanitems(self, items):
for item in items: for item in items:
value=item['value'] value = item['value']
if re.search('F;\s(\d+[\.,]?\d*)', value): m = re.search('F;\s(\d+[\.,]?\d*)', value)
#print re.search('F;\s(\d+[\.,]?\d*)', value).group(1) if m:
item['value']=re.search('F;\s(\d+[\.,]?\d*)', value).group(1) + " K" item['value'] = m.group(1) + " K"
if re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value): m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)
if m:
print item['value'] print item['value']
item['value']=re.search('(\d+[\.,]?\d*)\sJ\sK.+mol', value).group(1) + " J/K/mol" item['value'] = m.group(1) + " J/K/mol"
print item['value'] print item['value']
return items return items
def getchemspider(self, sel):
link=sel.xpath('//a[@title="ChemSpider"]/../../td[2]/span/a/@href').extract()[0] # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
print link
return link
def get_identifiers(self, sel): def get_identifiers(self, sel):
links=sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract() links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
'[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
print links print links
return links return links