From b4ff4a3c3b8f892efbc98306e5620e8f65a520e2 Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Thu, 3 Apr 2014 12:00:27 +0200
Subject: [PATCH 01/34] New file and branch for the Wikipedia parser

---
 FourmiCrawler/parsers/WikipediaParser.py | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 FourmiCrawler/parsers/WikipediaParser.py

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
new file mode 100644
index 0000000..460a205
--- /dev/null
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -0,0 +1 @@
+__author__ = 'Bas'

From 60c409da3da8cb1b5f129eea064c475efcaff7dd Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Thu, 3 Apr 2014 12:05:06 +0200
Subject: [PATCH 02/34] New file and branch for the Wikipedia parser

---
 FourmiCrawler/parsers/WikipediaParser.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 460a205..dd73ce0 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -1 +1,2 @@
 __author__ = 'Bas'
+#new branch
\ No newline at end of file

From 81a93c44bba38e911dd932ff4c250b08c9f37379 Mon Sep 17 00:00:00 2001
From: Nout van Deijck
Date: Thu, 3 Apr 2014 12:19:17 +0200
Subject: [PATCH 03/34] added author

---
 FourmiCrawler/parsers/WikipediaParser.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index dd73ce0..d88f4f1 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -1,2 +1,3 @@
 __author__ = 'Bas'
+__author__ = 'Nout'
 #new branch
\ No newline at end of file

From add4a13a4db4e5875e66fc8c179ab06c084abe0f Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Sun, 6 Apr 2014 18:02:09 +0200
Subject: [PATCH 04/34] Trying to make a start with the WikipediaParser, but I
 can't work out from the Scrapy website (or anywhere else) what the structure
 of the file should be, or how to test/run the crawling on a page.

---
 FourmiCrawler/parsers/WikipediaParser.py | 33 +++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index d88f4f1..b3dc36f 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -1,3 +1,30 @@
-__author__ = 'Bas'
-__author__ = 'Nout'
-#new branch
\ No newline at end of file
+import parser
+from scrapy.selector import Selector
+from FourmiCrawler.items import Result
+
+class WikipediaParser:
+
+    website = "http://en.wikipedia.org/wiki/Methane"
+    __spider = "WikipediaParser"
+
+
+    #def __init__(self, csid):
+    #    self.website = "http://en.wikipedia.org/wiki/{id}".format(id=csid)
+
+    #def parse(self, response):
+        #self.log('A response from %s just arrived!' % response.url)
+    def parse():
+        sel = Selector("http://en.wikipedia.org/wiki/Methane")
+        items = []
+        item = Result()
+        item['attribute']="Melting point"
+        item['value']=site.xpath('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
+        item['source']= self.website
+        items.append(item)
+        print item['attribute']
+        print item['value']
+        print item['source']
+        print "test"
+        return items
+
+    parse()
\ No newline at end of file

From f3807c3018e0cfea2a76e86953076bd84b3ab7d0 Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Sun, 6 Apr 2014 20:28:03 +0200
Subject: [PATCH 05/34] Fixed the errors, but still not able to run/test the
 parse() function

---
 FourmiCrawler/parsers/WikipediaParser.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index b3dc36f..cd0a2eb 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -7,13 +7,14 @@ class WikipediaParser:
     website = "http://en.wikipedia.org/wiki/Methane"
     __spider = "WikipediaParser"
-
+    print "test1"
 
     #def __init__(self, csid):
    #    self.website = "http://en.wikipedia.org/wiki/{id}".format(id=csid)
 
-    #def parse(self, response):
+    def parse(self, response):
+        print "test1"
         #self.log('A response from %s just arrived!' % response.url)
-    def parse():
+    #def parse():
         sel = Selector("http://en.wikipedia.org/wiki/Methane")
         items = []
         item = Result()
@@ -26,5 +27,3 @@ class WikipediaParser:
         print item['source']
         print "test"
         return items
-
-    parse()
\ No newline at end of file

From 4b0c4acf96992a486277a14185615466e8938e6a Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Tue, 8 Apr 2014 11:40:30 +0200
Subject: [PATCH 06/34] Updated the Wikipedia parser as a rightful subclass of
 Parser

---
 FourmiCrawler/parsers/WikipediaParser.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index cd0a2eb..6709472 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -1,11 +1,12 @@
-import parser
+from scrapy.http import Request
+from parser import Parser
 from scrapy.selector import Selector
 from FourmiCrawler.items import Result
 
-class WikipediaParser:
+class WikipediaParser(Parser):
 
-    website = "http://en.wikipedia.org/wiki/Methane"
-    __spider = "WikipediaParser"
+    website = "http://en.wikipedia.org/wiki/*"
+    __spider = None
     print "test1"
 
     #def __init__(self, csid):
     #    self.website = "http://en.wikipedia.org/wiki/{id}".format(id=csid)
@@ -27,3 +28,6 @@ class WikipediaParser:
         print item['source']
         print "test"
         return items
+
+    def new_compound_request(self, compound):
+        return Request(url=self.website[:-1] + compound, callback=self.parse)
\ No newline at end of file

From f9799c30d8d7a887454b9685bea7a37a5a0c4486 Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Tue, 8 Apr 2014 14:59:09 +0200
Subject: [PATCH 07/34] Parse is runnable now.
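
A quick way to exercise parse() by hand is the Scrapy shell (a sketch, not
part of this patch; it assumes the project root is on the Python path so
that the parsers package is importable):

    $ scrapy shell "http://en.wikipedia.org/wiki/Methane"
    >>> from FourmiCrawler.parsers.WikipediaParser import WikipediaParser
    >>> items = WikipediaParser().parse(response)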
---
 FourmiCrawler/parsers/WikipediaParser.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 6709472..724fb79 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -8,20 +8,18 @@ class WikipediaParser(Parser):
     website = "http://en.wikipedia.org/wiki/*"
     __spider = None
-    print "test1"
 
     #def __init__(self, csid):
     #    self.website = "http://en.wikipedia.org/wiki/{id}".format(id=csid)
 
     def parse(self, response):
-        print "test1"
+        print response.url
         #self.log('A response from %s just arrived!' % response.url)
-    #def parse():
-        sel = Selector("http://en.wikipedia.org/wiki/Methane")
+        sel = Selector(response)
         items = []
         item = Result()
         item['attribute']="Melting point"
-        item['value']=site.xpath('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
-        item['source']= self.website
+        item['value']="value1" # sel.xpath('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
+        item['source']= "Wikipedia"
         items.append(item)
         print item['attribute']
         print item['value']

From 91ed053ac5f2349771476d0dfc329389e0334ad3 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Tue, 15 Apr 2014 18:17:35 +0200
Subject: [PATCH 08/34] Stopped log from interfering with STDOUT

---
 Fourmi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Fourmi.py b/Fourmi.py
index 7c3cf7d..2f8bd1b 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -41,7 +41,7 @@ def setup_crawler(searchables):
 
 
 def start():
     setup_crawler(["Methane"])
-    log.start()
+    log.start(logstdout=False)
     reactor.run()

From 1ca3593ae1c85f4dcdb758ec84e4714f15f927cb Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 16 Apr 2014 00:35:19 +0200
Subject: [PATCH 09/34] Parse is runnable now.

---
 FourmiCrawler/parsers/WikipediaParser.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 724fb79..784fccf 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -18,14 +18,20 @@ class WikipediaParser(Parser):
         items = []
         item = Result()
         item['attribute']="Melting point"
-        item['value']="value1" # sel.xpath('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
+        item['value']= sel.xpath('//tr/td/a[@title="Melting point"]/../../td[2]/text()').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
         item['source']= "Wikipedia"
         items.append(item)
         print item['attribute']
         print item['value']
         print item['source']
-        print "test"
         return items
 
+    def getmeltingpoint(self, sel):
+        item=Result()
+        item['attribute']="Melting point"
+        item['value']= sel.xpath('//tr/td/a[@title="Melting point"]/../../td[2]/text()').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
+        item['source']= "Wikipedia"
+        return item
+
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)
\ No newline at end of file

From cd1637b0fe4d82d7e96cc43f035a9a282fde2c4c Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 16 Apr 2014 00:50:50 +0200
Subject: [PATCH 10/34] Both boiling point and melting point are now parsed
 from chemical Wikipedia pages. There is still an error about different types
 of attributes in the Result items; this needs to be fixed by cleaning up the
 retrieved data.
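
For reference, the lookup pattern behind the new get functions (a sketch;
sel is a scrapy Selector over a compound page): each infobox label is an
anchor such as <a title="Boiling point">, so the XPath finds that anchor,
climbs back up to its table row, and takes the row's second cell, which
holds the value:

    sel.xpath('//tr/td/a[@title="Boiling point"]/../../td[2]/text()').extract()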
---
 FourmiCrawler/parsers/WikipediaParser.py | 28 +++++++++++++++---------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 784fccf..458c2fd 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -5,6 +5,10 @@ from FourmiCrawler.items import Result
 
 class WikipediaParser(Parser):
 
+# General notes:
+# Redirects seem to not matter as Wikipedia returns the page the redirect forwards to
+# although this might lead to scraping both the original and the redirect with the same data.
+
     website = "http://en.wikipedia.org/wiki/*"
     __spider = None
 
@@ -16,16 +20,16 @@ class WikipediaParser(Parser):
         #self.log('A response from %s just arrived!' % response.url)
         sel = Selector(response)
         items = []
-        item = Result()
-        item['attribute']="Melting point"
-        item['value']= sel.xpath('//tr/td/a[@title="Melting point"]/../../td[2]/text()').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
-        item['source']= "Wikipedia"
-        items.append(item)
-        print item['attribute']
-        print item['value']
-        print item['source']
+        meltingpoint = self.getmeltingpoint(sel)
+        items.append(meltingpoint)
+        boilingpoint = self.getboilingpoint(sel)
+        print boilingpoint
+        items.append(boilingpoint)
         return items
 
+    def new_compound_request(self, compound):
+        return Request(url=self.website[:-1] + compound, callback=self.parse)
+
     def getmeltingpoint(self, sel):
         item=Result()
         item['attribute']="Melting point"
@@ -32,6 +36,10 @@ class WikipediaParser(Parser):
         item['value']= sel.xpath('//tr/td/a[@title="Melting point"]/../../td[2]/text()').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
         item['source']= "Wikipedia"
         return item
 
-    def new_compound_request(self, compound):
-        return Request(url=self.website[:-1] + compound, callback=self.parse)
\ No newline at end of file
+    def getboilingpoint(self, sel):
+        item=Result()
+        item['attribute']="Boiling point"
+        item['value']= sel.xpath('//tr/td/a[@title="Boiling point"]/../../td[2]/text()').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
+        item['source']= "Wikipedia"
+        return item
\ No newline at end of file

From d778050f36ee5bb469a4810d0afe959d8896b2e2 Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 16 Apr 2014 10:37:57 +0200
Subject: [PATCH 11/34] Able to parse the web links to other databases; one
 example done

---
 FourmiCrawler/parsers/WikipediaParser.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 458c2fd..6d15819 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -23,7 +23,7 @@ class WikipediaParser(Parser):
         meltingpoint = self.getmeltingpoint(sel)
         items.append(meltingpoint)
         boilingpoint = self.getboilingpoint(sel)
-        print boilingpoint
+        chemlink = self.getchemspider(sel)
         items.append(boilingpoint)
         return items
 
@@ -42,4 +42,9 @@ class WikipediaParser(Parser):
         item['attribute']="Boiling point"
         item['value']= sel.xpath('//tr/td/a[@title="Boiling point"]/../../td[2]/text()').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
         item['source']= "Wikipedia"
+        return item
+
+    def getchemspider(self, sel):
+        item=sel.xpath('//tr/td/a[@title="ChemSpider"]/../../td[2]/span/a/@href').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
+        print item
         return item
\ No newline at end of file

From d99548e3b6cac32026b962e020ccf5d90687c929 Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 16 Apr 2014 11:14:02 +0200
Subject: [PATCH 12/34] Added density, molar entropy and heat capacity

---
 FourmiCrawler/parsers/WikipediaParser.py | 36 ++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 6d15819..3276a87 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -20,11 +20,17 @@ class WikipediaParser(Parser):
         #self.log('A response from %s just arrived!' % response.url)
         sel = Selector(response)
         items = []
+        density = self.getdensity(sel)
+        items.append(density)
         meltingpoint = self.getmeltingpoint(sel)
         items.append(meltingpoint)
         boilingpoint = self.getboilingpoint(sel)
         chemlink = self.getchemspider(sel)
         items.append(boilingpoint)
+        heatcapacity = self.getheatcapacity(sel)
+        items.append(heatcapacity)
+        molarentropy = self.getmolarentropy(sel)
+        items.append(molarentropy)
         return items
 
     def new_compound_request(self, compound):
@@ -44,7 +50,31 @@ class WikipediaParser(Parser):
         item['source']= "Wikipedia"
         return item
 
+    def getdensity(self, sel):
+        item=Result()
+        item['attribute']="Density"
+        item['value']= sel.xpath('//tr/td/a[@title="Density"]/../../td[2]/text()').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
+        item['source']= "Wikipedia"
+        print item['value']
+        return item
+
+    def getheatcapacity(self, sel):
+        item=Result()
+        item['attribute']="Specific heat capacity"
+        item['value']= sel.xpath('//tr/td/a[@title="Specific heat capacity"]/../../td[2]/text()').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
+        item['source']= "Wikipedia"
+        print item['value']
+        return item
+
+    def getmolarentropy(self, sel):
+        item=Result()
+        item['attribute']="Standard molar entropy"
+        item['value']= sel.xpath('//tr/td/a[@title="Standard molar entropy"]/../../td[2]/text()').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
+        item['source']= "Wikipedia"
+        print item['value']
+        return item
+
     def getchemspider(self, sel):
-        item=sel.xpath('//tr/td/a[@title="ChemSpider"]/../../td[2]/span/a/@href').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
-        print item
-        return item
\ No newline at end of file
+        link=sel.xpath('//tr/td/a[@title="ChemSpider"]/../../td[2]/span/a/@href').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
+        print link
+        return link
\ No newline at end of file

From f1280dd66d49ce08db9580cfa7554d7ca922846c Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 16 Apr 2014 13:23:50 +0200
Subject: [PATCH 13/34] get value not list from xpath

---
 FourmiCrawler/parsers/WikipediaParser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 3276a87..f5903c6 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -75,6 +75,6 @@ class WikipediaParser(Parser):
         return item
 
     def getchemspider(self, sel):
-        link=sel.xpath('//tr/td/a[@title="ChemSpider"]/../../td[2]/span/a/@href').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
+        link=sel.xpath('//tr/td/a[@title="ChemSpider"]/../../td[2]/span/a/@href').extract()[0] # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
         print link
         return link
\ No newline at end of file

From ce3105f3c1fff2ae3cc53bd45c046666745749be Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 16 Apr 2014 14:56:32 +0200
Subject: [PATCH 14/34] Switched to a general loop over all values, this way
 getting all elements from the Wikipedia infobox (except for those with a
 colspan, because those mess up the name/value pairing)

---
 FourmiCrawler/parsers/WikipediaParser.py | 28 ++++++++++++++----------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index f5903c6..90eca6c 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -19,18 +19,22 @@ class WikipediaParser(Parser):
         print response.url
         #self.log('A response from %s just arrived!' % response.url)
         sel = Selector(response)
-        items = []
-        density = self.getdensity(sel)
-        items.append(density)
-        meltingpoint = self.getmeltingpoint(sel)
-        items.append(meltingpoint)
-        boilingpoint = self.getboilingpoint(sel)
-        chemlink = self.getchemspider(sel)
-        items.append(boilingpoint)
-        heatcapacity = self.getheatcapacity(sel)
-        items.append(heatcapacity)
-        molarentropy = self.getmolarentropy(sel)
-        items.append(molarentropy)
+        items = self.parse_infobox(sel)
+        return items
+
+    def parse_infobox(self, sel):
+        items=[]
+        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]').xpath('normalize-space(string())')
+        prop_names = tr_list[::2]
+        prop_values = tr_list[1::2]
+        for i, prop_name in enumerate(prop_names):
+            item = Result()
+            item['attribute'] = prop_name.extract().encode('utf-8')
+            item['value'] = prop_values[i].extract().encode('utf-8')
+            item['source'] = "Wikipedia"
+            items.append(item)
+            print "new: " + item['attribute']
+            print item['value']
         return items
 
     def new_compound_request(self, compound):

From 34c3a8b4d66cd5d80770a571768d7e7a5a96352c Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 16 Apr 2014 15:22:47 +0200
Subject: [PATCH 15/34] remove empty data points

---
 FourmiCrawler/parsers/WikipediaParser.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 90eca6c..c489424 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -33,10 +33,13 @@ class WikipediaParser(Parser):
             item['value'] = prop_values[i].extract().encode('utf-8')
             item['source'] = "Wikipedia"
             items.append(item)
-            print "new: " + item['attribute']
-            print item['value']
+            #print "new: " + item['attribute']
+            #print item['value']
+        items=filter(lambda a: a['value']!='', items)
+        print items
         return items
 
+
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)

From 74aa446f40e6050b3e0b333b21921d84462bdf0c Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 16 Apr 2014 15:27:36 +0200
Subject: [PATCH 16/34] minor edits (comments etc.)

---
 FourmiCrawler/parsers/WikipediaParser.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index c489424..810f3a1 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -33,13 +33,12 @@ class WikipediaParser(Parser):
             item['value'] = prop_values[i].extract().encode('utf-8')
             item['source'] = "Wikipedia"
             items.append(item)
-            #print "new: " + item['attribute']
-            #print item['value']
-        items=filter(lambda a: a['value']!='', items)
+            print "new: " + item['attribute']
+            print item['value']
+        items=filter(lambda a: a['value']!='', items) #remove items with an empty value
         print items
         return items
 
-
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)
 

From 6f82b117c9b2000a293678ee8928082969d0311f Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 16 Apr 2014 16:23:33 +0200
Subject: [PATCH 17/34] New function to clean up the data points

---
 FourmiCrawler/parsers/WikipediaParser.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 810f3a1..1ceeccb 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -33,21 +33,20 @@ class WikipediaParser(Parser):
             item['value'] = prop_values[i].extract().encode('utf-8')
             item['source'] = "Wikipedia"
             items.append(item)
-            print "new: " + item['attribute']
-            print item['value']
+            #print "new: " + item['attribute']
+            #print item['value']
         items=filter(lambda a: a['value']!='', items) #remove items with an empty value
-        print items
+        #print items
+        self.cleanitems(items)
         return items
 
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)
 
-    def getmeltingpoint(self, sel):
-        item=Result()
-        item['attribute']="Melting point"
-        item['value']= sel.xpath('//tr/td/a[@title="Melting point"]/../../td[2]/text()').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
-        item['source']= "Wikipedia"
-        return item
+    def cleanitems(self, items):
+        for item in items:
+            print item['value']
+        return items
 
     def getboilingpoint(self, sel):
         item=Result()

From be63315ca2c28208cd7657535c6b23f001e8a808 Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 16 Apr 2014 17:01:35 +0200
Subject: [PATCH 18/34] regex

---
 FourmiCrawler/parsers/WikipediaParser.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 1ceeccb..38d42f8 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -2,6 +2,7 @@ from scrapy.http import Request
 from parser import Parser
 from scrapy.selector import Selector
 from FourmiCrawler.items import Result
+import re
 
 class WikipediaParser(Parser):
 
@@ -45,7 +46,9 @@ class WikipediaParser(Parser):
 
     def cleanitems(self, items):
         for item in items:
-            print item['value']
+            value=item['value']
+            if re.match('3(...)', value):
+                print value
         return items
 
     def getboilingpoint(self, sel):

From b0146cdce8a8f90919c020f178fbb821b1ab61bc Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Tue, 22 Apr 2014 09:46:19 +0200
Subject: [PATCH 19/34] Added regular expressions to clean up temperature data

---
 FourmiCrawler/parsers/WikipediaParser.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)
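
Note (not recorded in the commit): a sketch of what the two new expressions
below are intended to match, using ASCII-simplified infobox strings:

    >>> import re
    >>> re.search('F;\s(\d+[\.,]?\d*)', '-182.5 C; -296.4 F; 90.7 K').group(1)
    '90.7'
    >>> re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', '186.25 J K-1 mol-1').group(1)
    '186.25'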
diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 38d42f8..07263a5 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -47,8 +47,13 @@ class WikipediaParser(Parser):
     def cleanitems(self, items):
         for item in items:
             value=item['value']
-            if re.match('3(...)', value):
-                print value
+            if re.search('F;\s(\d+[\.,]?\d*)', value):
+                #print re.search('F;\s(\d+[\.,]?\d*)', value).group(1)
+                item['value']=re.search('F;\s(\d+[\.,]?\d*)', value).group(1) + " K"
+            if re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value):
+                print item['value']
+                item['value']=re.search('(\d+[\.,]?\d*)\sJ\sK.+mol', value).group(1) + " J/K/mol"
+                print item['value']
         return items
 
     def getboilingpoint(self, sel):

From 1c518af5a615e5bee786f97e476604ef46d09891 Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 23 Apr 2014 11:06:59 +0200
Subject: [PATCH 20/34] Remove per-attribute get functions

---
 FourmiCrawler/parsers/WikipediaParser.py | 31 ------------------------
 1 file changed, 31 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 07263a5..54df891 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -56,37 +56,6 @@ class WikipediaParser(Parser):
             print item['value']
         return items
 
-    def getboilingpoint(self, sel):
-        item=Result()
-        item['attribute']="Boiling point"
-        item['value']= sel.xpath('//tr/td/a[@title="Boiling point"]/../../td[2]/text()').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
-        item['source']= "Wikipedia"
-        return item
-
-    def getdensity(self, sel):
-        item=Result()
-        item['attribute']="Density"
-        item['value']= sel.xpath('//tr/td/a[@title="Density"]/../../td[2]/text()').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
-        item['source']= "Wikipedia"
-        print item['value']
-        return item
-
-    def getheatcapacity(self, sel):
-        item=Result()
-        item['attribute']="Specific heat capacity"
-        item['value']= sel.xpath('//tr/td/a[@title="Specific heat capacity"]/../../td[2]/text()').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
-        item['source']= "Wikipedia"
-        print item['value']
-        return item
-
-    def getmolarentropy(self, sel):
-        item=Result()
-        item['attribute']="Standard molar entropy"
-        item['value']= sel.xpath('//tr/td/a[@title="Standard molar entropy"]/../../td[2]/text()').extract() # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
-        item['source']= "Wikipedia"
-        print item['value']
-        return item
-
     def getchemspider(self, sel):
         link=sel.xpath('//tr/td/a[@title="ChemSpider"]/../../td[2]/span/a/@href').extract()[0] # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
         print link

From fd5faf22e4b4816a213c233ac7793ee47afcb8af Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 23 Apr 2014 11:12:58 +0200
Subject: [PATCH 21/34] Added empty reliability and condition to prevent
 errors for now

---
 FourmiCrawler/parsers/WikipediaParser.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 54df891..4ca11c4 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -33,6 +33,8 @@ class WikipediaParser(Parser):
             item['value'] = prop_values[i].extract().encode('utf-8')
             item['source'] = "Wikipedia"
+            item['reliability'] = ""
+            item['conditions'] = ""
             items.append(item)
             #print "new: " + item['attribute']
             #print item['value']
         items=filter(lambda a: a['value']!='', items) #remove items with an empty value

From cb299df96f99784fa40b7cda9be7808b5d988716 Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 23 Apr 2014 11:46:43 +0200
Subject: [PATCH 22/34] Added log statements

---
 FourmiCrawler/parsers/WikipediaParser.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 4ca11c4..9c4f8ed 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -1,4 +1,5 @@
 from scrapy.http import Request
+from scrapy import log
 from parser import Parser
 from scrapy.selector import Selector
 from FourmiCrawler.items import Result
@@ -18,7 +19,7 @@ class WikipediaParser(Parser):
 
     def parse(self, response):
         print response.url
-        #self.log('A response from %s just arrived!' % response.url)
+        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
         sel = Selector(response)
         items = self.parse_infobox(sel)
         return items
@@ -36,10 +37,8 @@ class WikipediaParser(Parser):
             item['reliability'] = ""
             item['conditions'] = ""
             items.append(item)
-            #print "new: " + item['attribute']
-            #print item['value']
+            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
         items=filter(lambda a: a['value']!='', items) #remove items with an empty value
-        #print items
         self.cleanitems(items)
         return items

From 6dd03c293ad298dbe3bd916dace7515440b50922 Mon Sep 17 00:00:00 2001
From: Nout van Deijck
Date: Wed, 23 Apr 2014 12:08:33 +0200
Subject: [PATCH 23/34] Added check for already visited redirects of compounds

---
 FourmiCrawler/parsers/WikipediaParser.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 9c4f8ed..ddfbd48 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -13,6 +13,7 @@ class WikipediaParser(Parser):
 
     website = "http://en.wikipedia.org/wiki/*"
     __spider = None
+    searched_compounds = []
 
     #def __init__(self, csid):
     #    self.website = "http://en.wikipedia.org/wiki/{id}".format(id=csid)
@@ -21,11 +22,17 @@ class WikipediaParser(Parser):
     def parse(self, response):
         print response.url
         log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
         sel = Selector(response)
-        items = self.parse_infobox(sel)
-        return items
+        compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]
+        if compound in self.searched_compounds:
+            return None
+        else:
+            items = self.parse_infobox(sel)
+            self.searched_compounds.append(compound)
+            return items
 
     def parse_infobox(self, sel):
         items=[]
+
         tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]').xpath('normalize-space(string())')
         prop_names = tr_list[::2]

From f926f86d7ddeadde5b5cd9333d58be97b99cbb43 Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 23 Apr 2014 12:14:20 +0200
Subject: [PATCH 24/34] Small fix because the cleaned up items were not sent
 back

---
 FourmiCrawler/parsers/WikipediaParser.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index ddfbd48..b339a73 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -46,8 +46,7 @@ class WikipediaParser(Parser):
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
         items=filter(lambda a: a['value']!='', items) #remove items with an empty value
-        self.cleanitems(items)
-        return items
+        return self.cleanitems(items)
 
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)

From b5c83125f7f5fea42677761785598f4894c1a731 Mon Sep 17 00:00:00 2001
From: Nout van Deijck
Date: Wed, 23 Apr 2014 12:27:53 +0200
Subject: [PATCH 25/34] Added extra request for chemspider link retrieved from
 Wikipedia

---
 FourmiCrawler/parsers/WikipediaParser.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index b339a73..b60b98d 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -46,7 +46,10 @@ class WikipediaParser(Parser):
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
         items=filter(lambda a: a['value']!='', items) #remove items with an empty value
-        return self.cleanitems(items)
+        itemlist=self.cleanitems(items)
+        request=Request(self.getchemspider(sel))
+        itemlist.append(request)
+        return itemlist
 
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)

From 1ced65e2b6cf4843ef403586d722b36c3a814955 Mon Sep 17 00:00:00 2001
From: Nout van Deijck
Date: Wed, 23 Apr 2014 13:18:50 +0200
Subject: [PATCH 26/34] Parser now adds extra requests for every identifier to
 an external source that is in the Wikipedia chembox

---
 FourmiCrawler/parsers/WikipediaParser.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index b60b98d..5eea757 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -47,8 +47,14 @@ class WikipediaParser(Parser):
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
         items=filter(lambda a: a['value']!='', items) #remove items with an empty value
         itemlist=self.cleanitems(items)
-        request=Request(self.getchemspider(sel))
-        itemlist.append(request)
+
+        # request=Request(self.getchemspider(sel))
+        # itemlist.append(request)
+        for identifier in self.get_identifiers(sel):
+            request_identifier=Request(identifier)
+            # print request_identifier
+            itemlist.append(request_identifier)
+
         return itemlist
 
     def new_compound_request(self, compound):
@@ -67,6 +73,15 @@ class WikipediaParser(Parser):
         return items
 
     def getchemspider(self, sel):
-        link=sel.xpath('//tr/td/a[@title="ChemSpider"]/../../td[2]/span/a/@href').extract()[0] # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
+        link=sel.xpath('//a[@title="ChemSpider"]/../../td[2]/span/a/@href').extract()[0] # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
         print link
-        return link
\ No newline at end of file
+        return link
+
+    def get_identifiers(self, sel):
+        links=sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
+        # identifiers=[]
+        # for link in links:
+        #     identifier=Request(link)
+        #     identifiers.append(identifier)
+        #     print identifiers
+        return links
\ No newline at end of file

From 3e1b33164e9327ced227d00de6758b845c4d7cdf Mon Sep 17 00:00:00 2001
From: Nout van Deijck
Date: Wed, 23 Apr 2014 13:48:44 +0200
Subject: [PATCH 27/34] Some comments and trying a different for loop for
 adding requests

---
 FourmiCrawler/parsers/WikipediaParser.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 5eea757..3c152ce 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -50,10 +50,18 @@ class WikipediaParser(Parser):
 
         # request=Request(self.getchemspider(sel))
         # itemlist.append(request)
-        for identifier in self.get_identifiers(sel):
-            request_identifier=Request(identifier)
-            # print request_identifier
-            itemlist.append(request_identifier)
+
+        identifiers=self.get_identifiers(sel)
+        # print identifiers
+
+        for i, identifier in enumerate(identifiers):
+            request = Request(identifier)
+            print request
+
+        # for identifier in self.get_identifiers(sel):
+        #     request_identifier=Request(identifier)
+        #     # print request_identifier
+        #     itemlist.append(request_identifier)
 
         return itemlist
 
@@ -79,9 +87,6 @@ class WikipediaParser(Parser):
 
     def get_identifiers(self, sel):
         links=sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
-        # identifiers=[]
-        # for link in links:
-        #     identifier=Request(link)
-        #     identifiers.append(identifier)
-        #     print identifiers
+
+        print links
         return links
\ No newline at end of file

From 62475d965d81dba7863262fdeb745791de28921b Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Wed, 23 Apr 2014 15:24:57 +0200
Subject: [PATCH 28/34] Cleaning up code

---
 FourmiCrawler/parsers/WikipediaParser.py | 48 ++++++++++++------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 3c152ce..625023c 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -5,6 +5,7 @@ from scrapy.selector import Selector
 from FourmiCrawler.items import Result
 import re
 
+
 class WikipediaParser(Parser):
 
 # General notes:
@@ -15,8 +16,8 @@ class WikipediaParser(Parser):
     __spider = None
     searched_compounds = []
 
-    #def __init__(self, csid):
-    #    self.website = "http://en.wikipedia.org/wiki/{id}".format(id=csid)
+    def __init__(self):
+        pass
 
     def parse(self, response):
         print response.url
@@ -31,28 +32,29 @@ class WikipediaParser(Parser):
             return items
 
     def parse_infobox(self, sel):
-        items=[]
+        items = []
 
         tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]').xpath('normalize-space(string())')
         prop_names = tr_list[::2]
         prop_values = tr_list[1::2]
         for i, prop_name in enumerate(prop_names):
-            item = Result()
-            item['attribute'] = prop_name.extract().encode('utf-8')
-            item['value'] = prop_values[i].extract().encode('utf-8')
-            item['source'] = "Wikipedia"
-            item['reliability'] = ""
-            item['conditions'] = ""
+            item = Result({
+                'attribute': prop_name.extract().encode('utf-8'),
+                'value': prop_values[i].extract().encode('utf-8'),
+                'source': "Wikipedia",
+                'reliability': "",
+                'conditions': ""
+            })
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
-        items=filter(lambda a: a['value']!='', items) #remove items with an empty value
-        itemlist=self.cleanitems(items)
+        items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
+        itemlist = self.cleanitems(items)
 
         # request=Request(self.getchemspider(sel))
         # itemlist.append(request)
 
-        identifiers=self.get_identifiers(sel)
+        identifiers = self.get_identifiers(sel)
         # print identifiers
 
         for i, identifier in enumerate(identifiers):
             request = Request(identifier)
@@ -70,23 +72,21 @@ class WikipediaParser(Parser):
 
     def cleanitems(self, items):
         for item in items:
-            value=item['value']
-            if re.search('F;\s(\d+[\.,]?\d*)', value):
-                #print re.search('F;\s(\d+[\.,]?\d*)', value).group(1)
-                item['value']=re.search('F;\s(\d+[\.,]?\d*)', value).group(1) + " K"
-            if re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value):
+            value = item['value']
+            m = re.search('F;\s(\d+[\.,]?\d*)', value)
+            if m:
+                item['value'] = m.group(1) + " K"
+            m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)
+            if m:
                 print item['value']
-                item['value']=re.search('(\d+[\.,]?\d*)\sJ\sK.+mol', value).group(1) + " J/K/mol"
+                item['value'] = m.group(1) + " J/K/mol"
                 print item['value']
         return items
 
-    def getchemspider(self, sel):
-        link=sel.xpath('//a[@title="ChemSpider"]/../../td[2]/span/a/@href').extract()[0] # ('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
-        print link
-        return link
-
-    def get_identifiers(self, sel):
-        links=sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
+    def get_identifiers(self, sel):
+        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
+                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
         print links
         return links
\ No newline at end of file

From 507006889b0696e679cd85acadf78e314bc53b7b Mon Sep 17 00:00:00 2001
From: Nout van Deijck
Date: Wed, 23 Apr 2014 15:49:23 +0200
Subject: [PATCH 29/34] Fixed problem with strange URLs; now adds all external
 identifiers as requests

---
 FourmiCrawler/parsers/WikipediaParser.py | 21 ++++++++------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 625023c..abca68b 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -51,20 +51,17 @@ class WikipediaParser(Parser):
         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
         itemlist = self.cleanitems(items)
 
-        # request=Request(self.getchemspider(sel))
-        # itemlist.append(request)
-
         identifiers = self.get_identifiers(sel)
-        # print identifiers
 
         for i, identifier in enumerate(identifiers):
-            request = Request(identifier)
-            print request
-
-        # for identifier in self.get_identifiers(sel):
-        #     request_identifier=Request(identifier)
-        #     # print request_identifier
-        #     itemlist.append(request_identifier)
+            if re.match('//en\.wikipedia',identifier):
+                pass
+            elif re.match('/{2}',identifier):
+                identifier = re.sub("/{2}", "http://", identifier)
+                request = Request(identifier)
+            else:
+                request = Request(identifier)
+            itemlist.append(request)
 
         return itemlist
@@ -84,6 +81,4 @@ class WikipediaParser(Parser):
     def get_identifiers(self, sel):
         links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                           '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
-
-        print links
         return links
\ No newline at end of file

From 9cefd336e0b5daeaf96d8affc68d395029a1857b Mon Sep 17 00:00:00 2001
From: Nout van Deijck
Date: Wed, 23 Apr 2014 16:02:37 +0200
Subject: [PATCH 30/34] Cleaning up code and added log messages

---
 FourmiCrawler/parsers/WikipediaParser.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index abca68b..b671197 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -20,7 +20,6 @@ class WikipediaParser(Parser):
         pass
 
     def parse(self, response):
-        print response.url
         log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
         sel = Selector(response)
         compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]
@@ -35,7 +34,7 @@ class WikipediaParser(Parser):
 
         items = []
 
-        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]').xpath('normalize-space(string())')
+        tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').xpath('normalize-space(string())')
         prop_names = tr_list[::2]
         prop_values = tr_list[1::2]
         for i, prop_name in enumerate(prop_names):
@@ -55,12 +54,13 @@ class WikipediaParser(Parser):
 
         identifiers = self.get_identifiers(sel)
 
         for i, identifier in enumerate(identifiers):
             if re.match('//en\.wikipedia',identifier):
-                pass
+                log.msg('Found link to wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
             elif re.match('/{2}',identifier):
                 identifier = re.sub("/{2}", "http://", identifier)
                 request = Request(identifier)
             else:
                 request = Request(identifier)
+            log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
             itemlist.append(request)
@@ -76,9 +76,7 @@ class WikipediaParser(Parser):
             m = re.search('F;\s(\d+[\.,]?\d*)', value)
             if m:
                 item['value'] = m.group(1) + " K"
             m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)
             if m:
-                print item['value']
                 item['value'] = m.group(1) + " J/K/mol"
-                print item['value']
         return items

From 150fc5bea7142102c3d86196c68f0a84510f3316 Mon Sep 17 00:00:00 2001
From: Nout van Deijck
Date: Wed, 23 Apr 2014 16:17:23 +0200
Subject: [PATCH 31/34] added comments

---
 FourmiCrawler/parsers/WikipediaParser.py | 34 ++++++++++++++++--------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index b671197..3bcf786 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -8,9 +8,11 @@ import re
 
 class WikipediaParser(Parser):
 
-# General notes:
-# Redirects seem to not matter as Wikipedia returns the page the redirect forwards to
-# although this might lead to scraping both the original and the redirect with the same data.
+    """ Wikipedia scraper for chemical properties
+
+    This parser parses Wikipedia infoboxes (also bordered) to obtain properties and their values.
+    It also returns requests with other external sources which contain information on parsed subject.
+    """
 
     website = "http://en.wikipedia.org/wiki/*"
     __spider = None
@@ -33,11 +35,13 @@ class WikipediaParser(Parser):
             return items
 
     def parse_infobox(self, sel):
+        #scrape data from infobox on wikipedia.
         items = []
-
-        tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').xpath('normalize-space(string())')
+        #be sure to get both chembox (wikipedia template) and drugbox (wikipedia template) to scrape
+        tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').\
+            xpath('normalize-space(string())')
         prop_names = tr_list[::2]
         prop_values = tr_list[1::2]
         for i, prop_name in enumerate(prop_names):
             item = Result({
                 'attribute': prop_name.extract().encode('utf-8'),
@@ -56,10 +60,14 @@ class WikipediaParser(Parser):
 
         identifiers = self.get_identifiers(sel)
 
+        #add extra sources to scrape from as requests
         for i, identifier in enumerate(identifiers):
-            if re.match('//en\.wikipedia',identifier):
+            request = None
+            #discard internal wikipedia links
+            if re.match('//en\.wikipedia', identifier):
                 log.msg('Found link to wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
-            elif re.match('/{2}',identifier):
+            #fix links starting with '//www.'
+            elif re.match('/{2}', identifier):
                 identifier = re.sub("/{2}", "http://", identifier)
                 request = Request(identifier)
@@ -72,19 +80,23 @@ class WikipediaParser(Parser):
 
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)
 
-    def cleanitems(self, items):
+    @staticmethod
+    def cleanitems(items):
+        #clean up properties using regex, makes it possible to split the values from the units
         for item in items:
             value = item['value']
-            m = re.search('F;\s(\d+[\.,]?\d*)', value)
+            m = re.search('F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)
             if m:
                 item['value'] = m.group(1) + " K"
-            m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)
+            m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)  # clean up J/K/mol values
             if m:
                 item['value'] = m.group(1) + " J/K/mol"
         return items
 
-    def get_identifiers(self, sel):
+    @staticmethod
+    def get_identifiers(sel):
+        #find external links, named 'Identifiers' to different sources.
         links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                           '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
         return links
\ No newline at end of file

From 9cbdf57238629c91e8d56f7965af5715ea65790b Mon Sep 17 00:00:00 2001
From: Nout van Deijck
Date: Wed, 23 Apr 2014 16:24:27 +0200
Subject: [PATCH 32/34] fixed comments

---
 FourmiCrawler/parsers/WikipediaParser.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 3bcf786..a5b95b5 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -22,9 +22,10 @@ class WikipediaParser(Parser):
         pass
 
     def parse(self, response):
+        """ Distributes the above described behaviour """
         log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
         sel = Selector(response)
-        compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]
+        compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]  # makes sure to use main page
         if compound in self.searched_compounds:
             return None
         else:
@@ -34,7 +35,7 @@ class WikipediaParser(Parser):
             return items
 
     def parse_infobox(self, sel):
-        #scrape data from infobox on wikipedia.
+        """ scrape data from infobox on wikipedia. """
         items = []
         #be sure to get both chembox (wikipedia template) and drugbox (wikipedia template) to scrape
         tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').\
@@ -80,7 +81,7 @@ class WikipediaParser(Parser):
 
     @staticmethod
     def cleanitems(items):
-        #clean up properties using regex, makes it possible to split the values from the units
+        """ clean up properties using regex, makes it possible to split the values from the units """
         for item in items:
             value = item['value']
             m = re.search('F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)
@@ -93,7 +94,7 @@ class WikipediaParser(Parser):
 
     @staticmethod
     def get_identifiers(sel):
-        #find external links, named 'Identifiers' to different sources.
+        """ find external links, named 'Identifiers' to different sources. """
         links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                           '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
         return links
\ No newline at end of file

From c5bffffeda3944dff5536eca2fafe066f24b5f84 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 23 Apr 2014 22:55:28 +0200
Subject: [PATCH 33/34] Delayed refactor from developing branch

---
 FourmiCrawler/{parsers => sources}/WikipediaParser.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
 rename FourmiCrawler/{parsers => sources}/WikipediaParser.py (97%)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
similarity index 97%
rename from FourmiCrawler/parsers/WikipediaParser.py
rename to FourmiCrawler/sources/WikipediaParser.py
index a5b95b5..f13d0cf 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -1,12 +1,12 @@
 from scrapy.http import Request
 from scrapy import log
-from parser import Parser
+from source import Source
 from scrapy.selector import Selector
 from FourmiCrawler.items import Result
 import re
 
 
-class WikipediaParser(Parser):
+class WikipediaParser(Source):
     """ Wikipedia scraper for chemical properties
 
@@ -19,7 +19,7 @@ class WikipediaParser(Source):
 
     searched_compounds = []
 
     def __init__(self):
-        pass
+        Source.__init__(self)
 
     def parse(self, response):

From d523d4edcdc326aa1e316889e98676206578ab80 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 23 Apr 2014 22:58:04 +0200
Subject: [PATCH 34/34] Spelling errors

---
 FourmiCrawler/sources/WikipediaParser.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index f13d0cf..c251fca 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -7,10 +7,9 @@ import re
 
 class WikipediaParser(Source):
     """ Wikipedia scraper for chemical properties
-
-    This parser parses Wikipedia infoboxes (also bordered) to obtain properties and their values.
+    This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
     It also returns requests with other external sources which contain information on parsed subject.
     """
 
     website = "http://en.wikipedia.org/wiki/*"
@@ -53,7 +52,7 @@ class WikipediaParser(Source):
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
-        itemlist = self.cleanitems(items)
+        item_list = self.clean_items(items)
 
         identifiers = self.get_identifiers(sel)
 
@@ -62,7 +61,7 @@ class WikipediaParser(Source):
             request = None
             #discard internal wikipedia links
             if re.match('//en\.wikipedia', identifier):
-                log.msg('Found link to wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
+                log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
             #fix links starting with '//www.'
             elif re.match('/{2}', identifier):
                 identifier = re.sub("/{2}", "http://", identifier)
@@ -70,15 +69,15 @@ class WikipediaParser(Source):
             else:
                 request = Request(identifier)
             log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
-            itemlist.append(request)
+            item_list.append(request)
 
-        return itemlist
+        return item_list
 
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)
 
     @staticmethod
-    def cleanitems(items):
+    def clean_items(items):
         """ clean up properties using regex, makes it possible to split the values from the units """
         for item in items:
             value = item['value']