Now also (finally) scrapes property values and names, but they are not yet coupled together and not yet returned.
This commit is contained in:
parent
8083d0c7bc
commit
ba8f845178
@ -60,12 +60,20 @@ class PubChem(Source):
|
|||||||
log.msg('parsing data', level=log.DEBUG)
|
log.msg('parsing data', level=log.DEBUG)
|
||||||
requests = []
|
requests = []
|
||||||
|
|
||||||
|
sel = Selector(response)
|
||||||
|
# props = sel.xpath('.//div')
|
||||||
|
prop_values = sel.xpath('//div//a/text()').extract()
|
||||||
|
prop_names = sel.xpath('//div//a/ancestor::div/b/text()').extract()
|
||||||
|
|
||||||
|
print prop_values
|
||||||
|
print prop_names
|
||||||
|
|
||||||
|
# print props
|
||||||
|
|
||||||
return requests
|
return requests
|
||||||
|
|
||||||
|
|
||||||
|
# This (old) definition is kept here only as a reference for myself.
|
||||||
def parse_properties(self, sel):
|
def parse_properties(self, sel):
|
||||||
""" scrape data from 'Chemical and Physical Properties' box on PubChem. """
|
""" scrape data from 'Chemical and Physical Properties' box on PubChem. """
|
||||||
items = []
|
items = []
|
||||||
@ -95,9 +103,9 @@ class PubChem(Source):
|
|||||||
items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
|
items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
|
||||||
# item_list = self.clean_items(items)
|
# item_list = self.clean_items(items)
|
||||||
|
|
||||||
|
|
||||||
return items
|
return items
|
||||||
|
|
||||||
|
|
||||||
def new_compound_request(self, compound):
|
def new_compound_request(self, compound):
|
||||||
return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
|
return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user