From ce3105f3c1fff2ae3cc53bd45c046666745749be Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 16 Apr 2014 14:56:32 +0200
Subject: [PATCH] Switch to a general loop over all values so that every
 element of the Wikipedia infobox is extracted (cells with a colspan are
 skipped because they break the name/value pairing)

---
 FourmiCrawler/parsers/WikipediaParser.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index f5903c6..90eca6c 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -19,18 +19,22 @@ class WikipediaParser(Parser):
         print response.url
         #self.log('A response from %s just arrived!' % response.url)
         sel = Selector(response)
-        items = []
-        density = self.getdensity(sel)
-        items.append(density)
-        meltingpoint = self.getmeltingpoint(sel)
-        items.append(meltingpoint)
-        boilingpoint = self.getboilingpoint(sel)
-        chemlink = self.getchemspider(sel)
-        items.append(boilingpoint)
-        heatcapacity = self.getheatcapacity(sel)
-        items.append(heatcapacity)
-        molarentropy = self.getmolarentropy(sel)
-        items.append(molarentropy)
+        items = self.parse_infobox(sel)
+        return items
+
+    def parse_infobox(self, sel):
+        items = []
+        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]').xpath('normalize-space(string())')
+        prop_names = tr_list[::2]
+        prop_values = tr_list[1::2]
+        for i, prop_name in enumerate(prop_names):
+            item = Result()
+            item['attribute'] = prop_name.extract().encode('utf-8')
+            item['value'] = prop_values[i].extract().encode('utf-8')
+            item['source'] = "Wikipedia"
+            items.append(item)
+            print "new: " + item['attribute']
+            print item['value']
         return items
 
     def new_compound_request(self, compound):
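Note: the sketch below is not part of the patch. It is a minimal, standalone illustration of the name/value pairing idea that parse_infobox relies on: once the colspan cells are excluded, the remaining infobox td texts are assumed to alternate attribute name / attribute value, so the even-indexed entries are names and the odd-indexed ones are values. The pair_cells helper and the sample cell texts are hypothetical and exist only for this example.

    # Hypothetical helper showing the even/odd slicing used in parse_infobox:
    # names sit at indices 0, 2, 4, ... and values at indices 1, 3, 5, ...
    def pair_cells(cells):
        prop_names = cells[::2]    # every other cell, starting at index 0
        prop_values = cells[1::2]  # the cells in between, starting at index 1
        return zip(prop_names, prop_values)

    # Hypothetical sample data standing in for the extracted td texts.
    cells = ["Molar mass", "18.015 g/mol", "Density", "0.997 g/cm3"]
    for name, value in pair_cells(cells):
        print("%s: %s" % (name, value))  # e.g. "Molar mass: 18.015 g/mol"

A cell with a colspan spans both the name and value columns, so it would shift every later cell by one position and misalign the pairs; excluding those cells in the XPath keeps the alternation intact.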