From ba8f8451786088c12b4645f61261ab4e8d96598b Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Mon, 2 Jun 2014 09:26:36 +0200 Subject: [PATCH] now also (finally) scrapes property values and names, but not yet coupled together and not yet returned. --- FourmiCrawler/sources/PubChem.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index e2dcc8b..6718900 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -60,12 +60,20 @@ class PubChem(Source): log.msg('parsing data', level=log.DEBUG) requests = [] + sel = Selector(response) + # props = sel.xpath('.//div') + prop_values = sel.xpath('//div//a/text()').extract() + prop_names = sel.xpath('//div//a/ancestor::div/b/text()').extract() + print prop_values + print prop_names + # print props return requests + # this (old) definition is only here to help myself def parse_properties(self, sel): """ scrape data from 'Chemical and Physical Properties' box on PubChem. """ items = [] @@ -95,9 +103,9 @@ class PubChem(Source): items = filter(lambda a: a['value'] != '', items) # remove items with an empty value # item_list = self.clean_items(items) - return items + def new_compound_request(self, compound): return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)