From d778050f36ee5bb469a4810d0afe959d8896b2e2 Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 16 Apr 2014 10:37:57 +0200
Subject: [PATCH] Able to parse the weblinks to other databases, one example done

---
NOTE(review): the line breaks and indentation of this patch were reconstructed
  from a copy in which all whitespace had been collapsed onto one line. The
  4-space-per-level indentation of the hunk lines is an assumption; confirm it
  against FourmiCrawler/parsers/WikipediaParser.py before applying.
NOTE(review): in the first hunk, chemlink is assigned from getchemspider(sel)
  but is not appended to items or otherwise used within the lines shown.
NOTE(review): getchemspider prints its result with a Python 2 print statement
  and returns the raw .extract() value, whereas the method above it fills in
  the 'attribute', 'value' and 'source' keys of item. Presumably work in
  progress; confirm the intended return shape.
NOTE(review): the trailing comment on the getchemspider XPath line mentions
  "/wiki/Melting_point", duplicates the comment on the boiling-point line and
  does not refer to ChemSpider; it looks copy-pasted.

 FourmiCrawler/parsers/WikipediaParser.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 458c2fd..6d15819 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -23,7 +23,7 @@ class WikipediaParser(Parser):
         meltingpoint = self.getmeltingpoint(sel)
         items.append(meltingpoint)
         boilingpoint = self.getboilingpoint(sel)
-        print boilingpoint
+        chemlink = self.getchemspider(sel)
         items.append(boilingpoint)
         return items
 
@@ -42,4 +42,9 @@ class WikipediaParser(Parser):
         item['attribute']="Boiling point"
         item['value']= sel.xpath('//tr/td/a[@title="Boiling point"]/../../td[2]/text()').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
         item['source']= "Wikipedia"
+        return item
+
+    def getchemspider(self, sel):
+        item=sel.xpath('//tr/td/a[@title="ChemSpider"]/../../td[2]/span/a/@href').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
+        print item
         return item
\ No newline at end of file