Cleaning up code

parent 3e1b33164e
commit 62475d965d
@@ -5,6 +5,7 @@ from scrapy.selector import Selector
 from FourmiCrawler.items import Result
 import re
 
+
 class WikipediaParser(Parser):
 
     # General notes:
@@ -15,8 +16,8 @@ class WikipediaParser(Parser):
     __spider = None
     searched_compounds = []
 
-    #def __init__(self, csid):
-    #    self.website = "http://en.wikipedia.org/wiki/{id}".format(id=csid)
+    def __init__(self):
+        pass
 
     def parse(self, response):
         print response.url
@@ -31,27 +32,29 @@ class WikipediaParser(Parser):
         return items
 
     def parse_infobox(self, sel):
-        items=[]
+        items = []
 
         tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]').xpath('normalize-space(string())')
         prop_names = tr_list[::2]
         prop_values = tr_list[1::2]
         for i, prop_name in enumerate(prop_names):
-            item = Result()
-            item['attribute'] = prop_name.extract().encode('utf-8')
-            item['value'] = prop_values[i].extract().encode('utf-8')
-            item['source'] = "Wikipedia"
-            item['reliability'] = ""
-            item['conditions'] = ""
+            item = Result({
+                'attribute': prop_name.extract().encode('utf-8'),
+                'value': prop_values[i].extract().encode('utf-8'),
+                'source': "Wikipedia",
+                'reliability': "",
+                'conditions': ""
+            })
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
-        items=filter(lambda a: a['value']!='', items) #remove items with an empty value
-        itemlist=self.cleanitems(items)
+        items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
+        itemlist = self.cleanitems(items)
 
         # request=Request(self.getchemspider(sel))
         # itemlist.append(request)
 
-        identifiers=self.get_identifiers(sel)
+        identifiers = self.get_identifiers(sel)
         # print identifiers
 
         for i, identifier in enumerate(identifiers):
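The field-by-field assignments in parse_infobox are collapsed into a single dict-style constructor call. Scrapy items accept a dict as their first constructor argument, so both forms build the same item. A minimal sketch, assuming Result is a scrapy.Item with exactly the five fields used above:

    import scrapy

    # Stand-in for FourmiCrawler.items.Result (assumed field list).
    class Result(scrapy.Item):
        attribute = scrapy.Field()
        value = scrapy.Field()
        source = scrapy.Field()
        reliability = scrapy.Field()
        conditions = scrapy.Field()

    # Old style: create an empty item, then fill it field by field.
    item = Result()
    item['attribute'] = 'Melting point'
    item['value'] = '273.15 K'
    item['source'] = 'Wikipedia'
    item['reliability'] = ''
    item['conditions'] = ''

    # New style: populate the whole item in one expression.
    item = Result({
        'attribute': 'Melting point',
        'value': '273.15 K',
        'source': 'Wikipedia',
        'reliability': '',
        'conditions': '',
    })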
@@ -70,23 +73,20 @@ class WikipediaParser(Parser):
 
     def cleanitems(self, items):
         for item in items:
-            value=item['value']
-            if re.search('F;\s(\d+[\.,]?\d*)', value):
-                #print re.search('F;\s(\d+[\.,]?\d*)', value).group(1)
-                item['value']=re.search('F;\s(\d+[\.,]?\d*)', value).group(1) + " K"
-            if re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value):
+            value = item['value']
+            m = re.search('F;\s(\d+[\.,]?\d*)', value)
+            if m:
+                item['value'] = m.group(1) + " K"
+            m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)
+            if m:
                 print item['value']
-                item['value']=re.search('(\d+[\.,]?\d*)\sJ\sK.+mol', value).group(1) + " J/K/mol"
+                item['value'] = m.group(1) + " J/K/mol"
                 print item['value']
         return items
 
-    def getchemspider(self, sel):
-        link=sel.xpath('//a[@title="ChemSpider"]/../../td[2]/span/a/@href').extract()[0] # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
-        print link
-        return link
-
     def get_identifiers(self, sel):
-        links=sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
+        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
+                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
 
         print links
         return links
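The cleanitems refactor binds each regex match once (m = re.search(...)) instead of running the identical search a second time just to call .group(1). A small runnable sketch of the normalization, using the regexes from the diff above and invented sample strings modeled on Wikipedia infobox values:

    import re

    # Invented examples in the shape of Wikipedia infobox entries.
    melting_point = '0 C; 32 F; 273.15 K'
    heat_capacity = '75.3 J K-1 mol-1'

    # Temperatures: keep the Kelvin figure that follows the "F;" part.
    m = re.search('F;\s(\d+[\.,]?\d*)', melting_point)
    if m:
        print(m.group(1) + " K")        # -> 273.15 K

    # Molar heat capacities: normalize the unit to J/K/mol.
    m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', heat_capacity)
    if m:
        print(m.group(1) + " J/K/mol")  # -> 75.3 J/K/mol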
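The long selector in get_identifiers is wrapped by splitting it into two adjacent string literals; Python joins adjacent literals at compile time, so the XPath passed to sel.xpath() is unchanged:

    long_form = '//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a[contains(concat(" ",normalize-space(@class)," "),"external")]/@href'
    split_form = ('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                  '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href')
    assert long_form == split_form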