From 291547a5addfb5f79dd8bcc0cb80c798f20f05db Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 4 Jun 2014 15:44:53 +0200 Subject: [PATCH] now returns good results, with property values and corresponding sources --- FourmiCrawler/sources/PubChem.py | 34 +++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 6718900..1d20231 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -61,14 +61,34 @@ class PubChem(Source): requests = [] sel = Selector(response) - # props = sel.xpath('.//div') - prop_values = sel.xpath('//div//a/text()').extract() - prop_names = sel.xpath('//div//a/ancestor::div/b/text()').extract() + props = sel.xpath('//div') - print prop_values - print prop_names - - # print props + for prop in props: + prop_name = ''.join(prop.xpath('b/text()').extract()) + if prop.xpath('a'): + prop_source = ''.join(prop.xpath('a/@title').extract()) + prop_value = ''.join(prop.xpath('a/text()').extract()) + new_prop = Result({ + 'attribute': prop_name, + 'value': prop_value, + 'source': prop_source, + 'reliability': 'Unknown', + 'conditions': '' + }) + requests.append(new_prop) + elif prop.xpath('ul'): + prop_values = prop.xpath('ul//li') + for prop_li in prop_values: + prop_value = ''.join(prop_li.xpath('a/text()').extract()) + prop_source = ''.join(prop_li.xpath('a/@title').extract()) + new_prop = Result({ + 'attribute': prop_name, + 'value': prop_value, + 'source': prop_source, + 'reliability': 'Unknown', + 'conditions': '' + }) + requests.append(new_prop) return requests