Parser now adds an extra request for every identifier in the Wikipedia chembox that links to an external source
This commit is contained in:
parent
b5c83125f7
commit
1ced65e2b6
@ -47,8 +47,14 @@ class WikipediaParser(Parser):
|
|||||||
log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
|
log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
|
||||||
items=filter(lambda a: a['value']!='', items) #remove items with an empty value
|
items=filter(lambda a: a['value']!='', items) #remove items with an empty value
|
||||||
itemlist=self.cleanitems(items)
|
itemlist=self.cleanitems(items)
|
||||||
request=Request(self.getchemspider(sel))
|
|
||||||
itemlist.append(request)
|
# request=Request(self.getchemspider(sel))
|
||||||
|
# itemlist.append(request)
|
||||||
|
for identifier in self.get_identifiers(sel):
|
||||||
|
request_identifier=Request(identifier)
|
||||||
|
# print request_identifier
|
||||||
|
itemlist.append(request_identifier)
|
||||||
|
|
||||||
return itemlist
|
return itemlist
|
||||||
|
|
||||||
def new_compound_request(self, compound):
|
def new_compound_request(self, compound):
|
||||||
@ -67,6 +73,15 @@ class WikipediaParser(Parser):
|
|||||||
return items
|
return items
|
||||||
|
|
||||||
def getchemspider(self, sel):
|
def getchemspider(self, sel):
|
||||||
link=sel.xpath('//tr/td/a[@title="ChemSpider"]/../../td[2]/span/a/@href').extract()[0] # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
|
link=sel.xpath('//a[@title="ChemSpider"]/../../td[2]/span/a/@href').extract()[0] # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
|
||||||
print link
|
print link
|
||||||
return link
|
return link
|
||||||
|
|
||||||
|
def get_identifiers(self, sel):
|
||||||
|
links=sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
|
||||||
|
# identifiers=[]
|
||||||
|
# for link in links:
|
||||||
|
# identifier=Request(link)
|
||||||
|
# identifiers.append(identifier)
|
||||||
|
# print identifiers
|
||||||
|
return links
|
Reference in New Issue
Block a user