From 0da286c90753f021e33fa37c2e3bb27fe12b25c8 Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 8 Apr 2014 12:08:45 +0200 Subject: [PATCH 01/52] created basic structure of ChemSpider search parser --- FourmiCrawler/parsers/ChemSpider.py | 32 +++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 FourmiCrawler/parsers/ChemSpider.py diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py new file mode 100644 index 0000000..bd69e58 --- /dev/null +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -0,0 +1,32 @@ +from scrapy import log +from scrapy.http import Request +from scrapy.selector import Selector +from FourmiCrawler.items import Result +from ChemSpider_token import TOKEN #TODO: move the token elsewhere + +""" +This parser will manage searching for chemicals through the ChemsSpider API, +and parsing the resulting ChemSpider page. +The token required for the API should be in a configuration file somewhere. +""" +class ChemSpider: + + website = "http://www.chemspider.com/*" + __spider = 'ChemSpider' + + search = "Search.asmx/SimpleSearch?query=%s&token=%s" + + print "ChemSpider start" + log.msg('chemspider start', level=log.DEBUG) + + def parse(self, response): + sel = Selector(response) + log.msg('chemspider parse', level=log.DEBUG) + print "ChemSpider parse" + pass + + def new_compound_request(self,compound): + searchurl = website[:-1] + search % (compound, TOKEN) + log.msg('chemspider compound', level=log.DEBUG) + print "ChemSpider compound" + return Request(url=searchurl, callback=self.parse) From a4dc8c87112e2e541852e4c0aab3ed6782e5a6dd Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 8 Apr 2014 13:10:02 +0200 Subject: [PATCH 02/52] corrected Chemspider parser to be a subclass of Parser --- FourmiCrawler/parsers/ChemSpider.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index bd69e58..3890150 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -1,3 +1,4 @@ +from parser import Parser from scrapy import log from scrapy.http import Request from scrapy.selector import Selector @@ -9,7 +10,7 @@ This parser will manage searching for chemicals through the ChemsSpider API, and parsing the resulting ChemSpider page. The token required for the API should be in a configuration file somewhere. """ -class ChemSpider: +class ChemSpider(Parser): website = "http://www.chemspider.com/*" __spider = 'ChemSpider' @@ -26,7 +27,7 @@ class ChemSpider: pass def new_compound_request(self,compound): - searchurl = website[:-1] + search % (compound, TOKEN) + searchurl = self.website[:-1] + self.search % (compound, TOKEN) log.msg('chemspider compound', level=log.DEBUG) print "ChemSpider compound" return Request(url=searchurl, callback=self.parse) From 0e3ef9a79222aa5e18436d021fea79f877423765 Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 8 Apr 2014 16:14:47 +0200 Subject: [PATCH 03/52] hardcoded ChemSpider API token into ChemSpider.py --- FourmiCrawler/parsers/ChemSpider.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 3890150..de05639 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -3,7 +3,6 @@ from scrapy import log from scrapy.http import Request from scrapy.selector import Selector from FourmiCrawler.items import Result -from ChemSpider_token import TOKEN #TODO: move the token elsewhere """ This parser will manage searching for chemicals through the ChemsSpider API, @@ -15,7 +14,7 @@ class ChemSpider(Parser): website = "http://www.chemspider.com/*" __spider = 'ChemSpider' - search = "Search.asmx/SimpleSearch?query=%s&token=%s" + search = "Search.asmx/SimpleSearch?query=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338" print "ChemSpider start" log.msg('chemspider start', level=log.DEBUG) @@ -27,7 +26,7 @@ class ChemSpider(Parser): pass def new_compound_request(self,compound): - searchurl = self.website[:-1] + self.search % (compound, TOKEN) + searchurl = self.website[:-1] + self.search % compound log.msg('chemspider compound', level=log.DEBUG) print "ChemSpider compound" return Request(url=searchurl, callback=self.parse) From 246463b450f349cd5c1f7190d303b90ab004f97b Mon Sep 17 00:00:00 2001 From: RTB Date: Sat, 12 Apr 2014 19:19:56 +0200 Subject: [PATCH 04/52] simplified debug output, WARNING label should be temporary --- FourmiCrawler/parsers/ChemSpider.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index de05639..ee01d33 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -16,17 +16,11 @@ class ChemSpider(Parser): search = "Search.asmx/SimpleSearch?query=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338" - print "ChemSpider start" - log.msg('chemspider start', level=log.DEBUG) - def parse(self, response): sel = Selector(response) - log.msg('chemspider parse', level=log.DEBUG) - print "ChemSpider parse" - pass + log.msg('chemspider parse', level=log.WARNING) def new_compound_request(self,compound): searchurl = self.website[:-1] + self.search % compound - log.msg('chemspider compound', level=log.DEBUG) - print "ChemSpider compound" + log.msg('chemspider compound', level=log.WARNING) return Request(url=searchurl, callback=self.parse) From 22fa67735d87b372601db0185724fab0c47b3a29 Mon Sep 17 00:00:00 2001 From: RTB Date: Sat, 12 Apr 2014 19:41:36 +0200 Subject: [PATCH 05/52] added parse_searchrequest function --- FourmiCrawler/parsers/ChemSpider.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index ee01d33..ab62578 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -15,12 +15,23 @@ class ChemSpider(Parser): __spider = 'ChemSpider' search = "Search.asmx/SimpleSearch?query=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338" + structure = "Chemical-Structure.%s.html" def parse(self, response): sel = Selector(response) log.msg('chemspider parse', level=log.WARNING) + + def parse_searchrequest(self, response): + sel = Selector(response) + log.msg('chemspider parse_searchrequest', level=log.WARNING) + sel.register_namespace('cs', 'http://www.chemspider.com/') + csid = sel.xpath('.//cs:int/text()').extract()[0] + #TODO: handle multiple csids in case of vague search term + structure_url = self.website[:-1] + self.structure % csid + log.msg('chemspider URL: %s' % structure_url, level=log.WARNING) + return Request(structure_url, callback=self.parse) def new_compound_request(self,compound): searchurl = self.website[:-1] + self.search % compound log.msg('chemspider compound', level=log.WARNING) - return Request(url=searchurl, callback=self.parse) + return Request(url=searchurl, callback=self.parse_searchrequest) From 859a18c61a260fc1546df78c05e07a24468c91d1 Mon Sep 17 00:00:00 2001 From: RTB Date: Sat, 12 Apr 2014 22:27:28 +0200 Subject: [PATCH 06/52] added parsing of synonyms --- FourmiCrawler/parsers/ChemSpider.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index ab62578..adfad2f 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -19,7 +19,26 @@ class ChemSpider(Parser): def parse(self, response): sel = Selector(response) - log.msg('chemspider parse', level=log.WARNING) + synonyms = [] + for syn in sel.xpath('//p[@class="syn"]/strong/text()').extract(): + synonyms.append( self.new_synonym( syn, 'high' ) ) + for syn in sel.xpath('//p[@class="syn"]/span[@class="synonym_confirmed"]/text()').extract(): + synonyms.append( self.new_synonym( syn, 'medium' ) ) + for syn in sel.xpath('//p[@class="syn"]/span[@class=""]/text()').extract(): + synonyms.append( self.new_synonym( syn, 'low' ) ) + + return synonyms + + def new_synonym(self, name, reliability): + log.msg('CS synonym: %s (%s)' % (name, reliability), level=log.WARNING) + synonym = Result() + synonym['attribute'] = 'synonym' + synonym['value'] = name + synonym['source'] = self.__spider + synonym['reliability'] = reliability + synonym['conditions'] = None + return synonym + def parse_searchrequest(self, response): sel = Selector(response) From 5565c28a1ee8b1c3315b526250d62c95f5fca1e4 Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 13 Apr 2014 23:14:23 +0200 Subject: [PATCH 07/52] moved parsing of synonyms to 'parse_synonyms' function --- FourmiCrawler/parsers/ChemSpider.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index adfad2f..01eb274 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -19,6 +19,13 @@ class ChemSpider(Parser): def parse(self, response): sel = Selector(response) + requests = [] + requests_synonyms = self.parse_synonyms(sel) + requests.extend(requests_synonyms) + return requests + + def parse_synonyms(self, sel): + requests = [] synonyms = [] for syn in sel.xpath('//p[@class="syn"]/strong/text()').extract(): synonyms.append( self.new_synonym( syn, 'high' ) ) @@ -27,7 +34,7 @@ class ChemSpider(Parser): for syn in sel.xpath('//p[@class="syn"]/span[@class=""]/text()').extract(): synonyms.append( self.new_synonym( syn, 'low' ) ) - return synonyms + return requests def new_synonym(self, name, reliability): log.msg('CS synonym: %s (%s)' % (name, reliability), level=log.WARNING) From 0ad98905e3cfca43e34d92cede65cfa7ec727469 Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 13 Apr 2014 23:35:25 +0200 Subject: [PATCH 08/52] added scraping for wikipedia links in synonym tab --- FourmiCrawler/parsers/ChemSpider.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 01eb274..679e4ca 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -22,6 +22,9 @@ class ChemSpider(Parser): requests = [] requests_synonyms = self.parse_synonyms(sel) requests.extend(requests_synonyms) + for wiki_url in sel.xpath('.//a[@title="Wiki"]/@href').extract(): + requests.append( Request(url=wiki_url) ) + return requests def parse_synonyms(self, sel): From b1b969a16c7bb0e45e9b906d0ececd54514e7e79 Mon Sep 17 00:00:00 2001 From: RTB Date: Mon, 14 Apr 2014 00:28:47 +0200 Subject: [PATCH 09/52] corrected usage of __spider variable --- FourmiCrawler/parsers/ChemSpider.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 679e4ca..01a7c95 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -12,7 +12,6 @@ The token required for the API should be in a configuration file somewhere. class ChemSpider(Parser): website = "http://www.chemspider.com/*" - __spider = 'ChemSpider' search = "Search.asmx/SimpleSearch?query=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338" structure = "Chemical-Structure.%s.html" @@ -44,7 +43,7 @@ class ChemSpider(Parser): synonym = Result() synonym['attribute'] = 'synonym' synonym['value'] = name - synonym['source'] = self.__spider + synonym['source'] = 'ChemSpider' synonym['reliability'] = reliability synonym['conditions'] = None return synonym From 564dbc32929d8e45b22f85e219adc22cf2cb1bd5 Mon Sep 17 00:00:00 2001 From: RTB Date: Mon, 14 Apr 2014 00:33:25 +0200 Subject: [PATCH 10/52] added ignore list to new_compound_request for synonyms found by chemspider parser --- FourmiCrawler/parsers/ChemSpider.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 01a7c95..fb0b9fa 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -16,6 +16,8 @@ class ChemSpider(Parser): search = "Search.asmx/SimpleSearch?query=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338" structure = "Chemical-Structure.%s.html" + ignore_list = [] + def parse(self, response): sel = Selector(response) requests = [] @@ -36,6 +38,8 @@ class ChemSpider(Parser): for syn in sel.xpath('//p[@class="syn"]/span[@class=""]/text()').extract(): synonyms.append( self.new_synonym( syn, 'low' ) ) + self.ignore_list.extend(synonyms) + return requests def new_synonym(self, name, reliability): @@ -60,6 +64,8 @@ class ChemSpider(Parser): return Request(structure_url, callback=self.parse) def new_compound_request(self,compound): + if compound in self.ignore_list: #TODO: add regular expression + return None searchurl = self.website[:-1] + self.search % compound log.msg('chemspider compound', level=log.WARNING) return Request(url=searchurl, callback=self.parse_searchrequest) From e95df8eaa3c8dc77b2549055b4011b55ee04df49 Mon Sep 17 00:00:00 2001 From: RTB Date: Mon, 14 Apr 2014 01:20:24 +0200 Subject: [PATCH 11/52] ignore_list now contains the intended names instead of Result objects --- FourmiCrawler/parsers/ChemSpider.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index fb0b9fa..72f828a 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -38,12 +38,11 @@ class ChemSpider(Parser): for syn in sel.xpath('//p[@class="syn"]/span[@class=""]/text()').extract(): synonyms.append( self.new_synonym( syn, 'low' ) ) - self.ignore_list.extend(synonyms) - return requests def new_synonym(self, name, reliability): log.msg('CS synonym: %s (%s)' % (name, reliability), level=log.WARNING) + self.ignore_list.append(name) synonym = Result() synonym['attribute'] = 'synonym' synonym['value'] = name From 31a63829f8235aa2fd3406d147957519e3157b19 Mon Sep 17 00:00:00 2001 From: RTB Date: Mon, 14 Apr 2014 01:23:15 +0200 Subject: [PATCH 12/52] chemspider parser now makes new synonym requests with the scraped synonyms --- FourmiCrawler/parsers/ChemSpider.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 72f828a..cad8490 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -38,6 +38,10 @@ class ChemSpider(Parser): for syn in sel.xpath('//p[@class="syn"]/span[@class=""]/text()').extract(): synonyms.append( self.new_synonym( syn, 'low' ) ) + for synonym in synonyms: + if synonym['reliability'] == 'high': + self._Parser__spider.get_synonym_requests(synonym['value']) + return requests def new_synonym(self, name, reliability): From 2ae3ac9c51ae580f66e5ba7ceca2cb7cfc75d85f Mon Sep 17 00:00:00 2001 From: RTB Date: Mon, 14 Apr 2014 13:09:14 +0200 Subject: [PATCH 13/52] added parse_properties to scrape the Experimental Physico-chemical Properties table if it exists --- FourmiCrawler/parsers/ChemSpider.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index cad8490..1dbc994 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -23,11 +23,36 @@ class ChemSpider(Parser): requests = [] requests_synonyms = self.parse_synonyms(sel) requests.extend(requests_synonyms) + requests_properties = self.parse_properties(sel) + requests.extend(requests_properties) for wiki_url in sel.xpath('.//a[@title="Wiki"]/@href').extract(): requests.append( Request(url=wiki_url) ) return requests + def parse_properties(self, sel): + requests = [] + properties = [] + scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical Properties"]//li/table/tr/td') + if not scraped_list: + return None + property_name = scraped_list.pop(0).xpath('span/text()').extract()[0].rstrip() + for line in scraped_list: + if line.xpath('span/text()'): + property_name = line.xpath('span/text()').extract()[0].rstrip() + else: + new_prop = Result() + new_prop['attribute'] = property_name + new_prop['value'] = line.xpath('text()').extract()[0].rstrip() + new_prop['source'] = line.xpath('strong/text()').extract()[0].rstrip() + new_prop['reliability'] = None + new_prop['conditions'] = None + properties.append(new_prop) + log.msg('CS prop: |%s| |%s| |%s|' \ + % (new_prop['attribute'],new_prop['value'], new_prop['source']), + level=log.WARNING) + return properties + def parse_synonyms(self, sel): requests = [] synonyms = [] From ff0eb309da57f52b5d8a3902bc21abfb69cf536c Mon Sep 17 00:00:00 2001 From: RTB Date: Mon, 14 Apr 2014 17:27:02 +0200 Subject: [PATCH 14/52] ChemSpider parser now handles the Predicted - ACD/Labs tab for scraping properties --- FourmiCrawler/parsers/ChemSpider.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 1dbc994..99fd9e3 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -33,6 +33,22 @@ class ChemSpider(Parser): def parse_properties(self, sel): requests = [] properties = [] + + td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath('normalize-space(string())') + prop_names = td_list[::2] + prop_values = td_list[1::2] + for i, prop_name in enumerate(prop_names): + new_prop = Result() + new_prop['attribute'] = prop_name.extract().encode('utf-8') + new_prop['value'] = prop_values[i].extract().encode('utf-8') + new_prop['source'] = 'ChemSpider Predicted - ACD/Labs Tab' + new_prop['reliability'] = None + new_prop['conditions'] = None + properties.append(new_prop) + log.msg('CS prop: |%s| |%s| |%s|' \ + % (new_prop['attribute'],new_prop['value'], new_prop['source']), + level=log.WARNING) + scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical Properties"]//li/table/tr/td') if not scraped_list: return None @@ -51,6 +67,7 @@ class ChemSpider(Parser): log.msg('CS prop: |%s| |%s| |%s|' \ % (new_prop['attribute'],new_prop['value'], new_prop['source']), level=log.WARNING) + return properties def parse_synonyms(self, sel): From 8e46762a9e9308275f60cc872f9202f386d48f43 Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 15 Apr 2014 18:56:38 +0200 Subject: [PATCH 15/52] fix: if no experimental data, return predicted acd/labs data instead of None --- FourmiCrawler/parsers/ChemSpider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 99fd9e3..47794d0 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -51,7 +51,7 @@ class ChemSpider(Parser): scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical Properties"]//li/table/tr/td') if not scraped_list: - return None + return properties property_name = scraped_list.pop(0).xpath('span/text()').extract()[0].rstrip() for line in scraped_list: if line.xpath('span/text()'): From c1b5f810cbb2e32c55edf95cbd21dd475b7ad54f Mon Sep 17 00:00:00 2001 From: Rob tB Date: Wed, 16 Apr 2014 11:53:59 +0200 Subject: [PATCH 16/52] unused Result properties are now empty string instead of None --- FourmiCrawler/parsers/ChemSpider.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 47794d0..9617dc3 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -42,8 +42,8 @@ class ChemSpider(Parser): new_prop['attribute'] = prop_name.extract().encode('utf-8') new_prop['value'] = prop_values[i].extract().encode('utf-8') new_prop['source'] = 'ChemSpider Predicted - ACD/Labs Tab' - new_prop['reliability'] = None - new_prop['conditions'] = None + new_prop['reliability'] = '' + new_prop['conditions'] = '' properties.append(new_prop) log.msg('CS prop: |%s| |%s| |%s|' \ % (new_prop['attribute'],new_prop['value'], new_prop['source']), @@ -61,8 +61,8 @@ class ChemSpider(Parser): new_prop['attribute'] = property_name new_prop['value'] = line.xpath('text()').extract()[0].rstrip() new_prop['source'] = line.xpath('strong/text()').extract()[0].rstrip() - new_prop['reliability'] = None - new_prop['conditions'] = None + new_prop['reliability'] = '' + new_prop['conditions'] = '' properties.append(new_prop) log.msg('CS prop: |%s| |%s| |%s|' \ % (new_prop['attribute'],new_prop['value'], new_prop['source']), @@ -94,7 +94,7 @@ class ChemSpider(Parser): synonym['value'] = name synonym['source'] = 'ChemSpider' synonym['reliability'] = reliability - synonym['conditions'] = None + synonym['conditions'] = '' return synonym From 93a6f098a91b757805fcd78605bab7133bae1ee9 Mon Sep 17 00:00:00 2001 From: Rob tB Date: Wed, 16 Apr 2014 13:28:59 +0200 Subject: [PATCH 17/52] log messages are now DEBUG instead of WARNING --- FourmiCrawler/parsers/ChemSpider.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 9617dc3..1988aef 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -47,7 +47,7 @@ class ChemSpider(Parser): properties.append(new_prop) log.msg('CS prop: |%s| |%s| |%s|' \ % (new_prop['attribute'],new_prop['value'], new_prop['source']), - level=log.WARNING) + level=log.DEBUG) scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical Properties"]//li/table/tr/td') if not scraped_list: @@ -66,7 +66,7 @@ class ChemSpider(Parser): properties.append(new_prop) log.msg('CS prop: |%s| |%s| |%s|' \ % (new_prop['attribute'],new_prop['value'], new_prop['source']), - level=log.WARNING) + level=log.DEBUG) return properties @@ -87,7 +87,7 @@ class ChemSpider(Parser): return requests def new_synonym(self, name, reliability): - log.msg('CS synonym: %s (%s)' % (name, reliability), level=log.WARNING) + log.msg('CS synonym: %s (%s)' % (name, reliability), level=log.DEBUG) self.ignore_list.append(name) synonym = Result() synonym['attribute'] = 'synonym' @@ -100,17 +100,17 @@ class ChemSpider(Parser): def parse_searchrequest(self, response): sel = Selector(response) - log.msg('chemspider parse_searchrequest', level=log.WARNING) + log.msg('chemspider parse_searchrequest', level=log.DEBUG) sel.register_namespace('cs', 'http://www.chemspider.com/') csid = sel.xpath('.//cs:int/text()').extract()[0] #TODO: handle multiple csids in case of vague search term structure_url = self.website[:-1] + self.structure % csid - log.msg('chemspider URL: %s' % structure_url, level=log.WARNING) + log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG) return Request(structure_url, callback=self.parse) def new_compound_request(self,compound): if compound in self.ignore_list: #TODO: add regular expression return None searchurl = self.website[:-1] + self.search % compound - log.msg('chemspider compound', level=log.WARNING) + log.msg('chemspider compound', level=log.DEBUG) return Request(url=searchurl, callback=self.parse_searchrequest) From 87282fc572ca8004f913c44b97a6e35bcee78f83 Mon Sep 17 00:00:00 2001 From: Rob tB Date: Wed, 16 Apr 2014 14:26:27 +0200 Subject: [PATCH 18/52] new properties in parse_properties now use dictionary syntax --- FourmiCrawler/parsers/ChemSpider.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 1988aef..4c45c92 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -38,12 +38,13 @@ class ChemSpider(Parser): prop_names = td_list[::2] prop_values = td_list[1::2] for i, prop_name in enumerate(prop_names): - new_prop = Result() - new_prop['attribute'] = prop_name.extract().encode('utf-8') - new_prop['value'] = prop_values[i].extract().encode('utf-8') - new_prop['source'] = 'ChemSpider Predicted - ACD/Labs Tab' - new_prop['reliability'] = '' - new_prop['conditions'] = '' + new_prop = Result({ + 'attribute': prop_name.extract().encode('utf-8'), + 'value': prop_values[i].extract().encode('utf-8'), + 'source': 'ChemSpider Predicted - ACD/Labs Tab', + 'reliability': '', + 'conditions': '' + }) properties.append(new_prop) log.msg('CS prop: |%s| |%s| |%s|' \ % (new_prop['attribute'],new_prop['value'], new_prop['source']), @@ -57,12 +58,13 @@ class ChemSpider(Parser): if line.xpath('span/text()'): property_name = line.xpath('span/text()').extract()[0].rstrip() else: - new_prop = Result() - new_prop['attribute'] = property_name - new_prop['value'] = line.xpath('text()').extract()[0].rstrip() - new_prop['source'] = line.xpath('strong/text()').extract()[0].rstrip() - new_prop['reliability'] = '' - new_prop['conditions'] = '' + new_prop = Result({ + 'attribute': property_name, + 'value': line.xpath('text()').extract()[0].rstrip(), + 'source': line.xpath('strong/text()').extract()[0].rstrip(), + 'reliability': '', + 'conditions': '' + }) properties.append(new_prop) log.msg('CS prop: |%s| |%s| |%s|' \ % (new_prop['attribute'],new_prop['value'], new_prop['source']), From 7fc980befe42132ba8468dfa9b6f19be5754ba0d Mon Sep 17 00:00:00 2001 From: Rob tB Date: Wed, 16 Apr 2014 15:02:37 +0200 Subject: [PATCH 19/52] chemspider should now only generate new Requests for wikipedia links from 'expert confirmed' synonyms --- FourmiCrawler/parsers/ChemSpider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 4c45c92..a36a42f 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -25,7 +25,7 @@ class ChemSpider(Parser): requests.extend(requests_synonyms) requests_properties = self.parse_properties(sel) requests.extend(requests_properties) - for wiki_url in sel.xpath('.//a[@title="Wiki"]/@href').extract(): + for wiki_url in sel.xpath('.//p[@class="syn"][strong]/a[@title="Wiki"]/@href').extract(): requests.append( Request(url=wiki_url) ) return requests From 2d314aee6ab03a2cb2a1dc6c48e43a19a911a631 Mon Sep 17 00:00:00 2001 From: Rob tB Date: Wed, 16 Apr 2014 15:21:33 +0200 Subject: [PATCH 20/52] created stub to parse ExtendedCompoundInfo from ChemSpider MassSpec API --- FourmiCrawler/parsers/ChemSpider.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index a36a42f..a4b3869 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -15,6 +15,7 @@ class ChemSpider(Parser): search = "Search.asmx/SimpleSearch?query=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338" structure = "Chemical-Structure.%s.html" + extendedinfo = "MassSpecAPI.asmx/GetExtendedCompoundInfo?query=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338" ignore_list = [] @@ -99,6 +100,8 @@ class ChemSpider(Parser): synonym['conditions'] = '' return synonym + def parse_extendedinfo(self, response): + pass def parse_searchrequest(self, response): sel = Selector(response) @@ -107,8 +110,10 @@ class ChemSpider(Parser): csid = sel.xpath('.//cs:int/text()').extract()[0] #TODO: handle multiple csids in case of vague search term structure_url = self.website[:-1] + self.structure % csid + extendedinfo_url = self.website[:-1] + self.extendedinfo % csid log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG) - return Request(structure_url, callback=self.parse) + return [Request(url=structure_url, callback=self.parse), + Request(url=extendedinfo_url, callback=self.parse_extendedinfo)] def new_compound_request(self,compound): if compound in self.ignore_list: #TODO: add regular expression From caf7d3df4ee7ad8e8333a82b7a3f55948c2b116b Mon Sep 17 00:00:00 2001 From: Rob tB Date: Wed, 16 Apr 2014 15:27:10 +0200 Subject: [PATCH 21/52] fixed ExtendedCompoundInfo url to have csid parameter instead of query --- FourmiCrawler/parsers/ChemSpider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index a4b3869..d468c39 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -15,7 +15,7 @@ class ChemSpider(Parser): search = "Search.asmx/SimpleSearch?query=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338" structure = "Chemical-Structure.%s.html" - extendedinfo = "MassSpecAPI.asmx/GetExtendedCompoundInfo?query=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338" + extendedinfo = "MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338" ignore_list = [] From 9a78e186bc007a0ce75041e730310860b2bdf682 Mon Sep 17 00:00:00 2001 From: Rob tB Date: Wed, 16 Apr 2014 16:22:47 +0200 Subject: [PATCH 22/52] chemspider parser now grabs data from ExtendedCompoundInfo() of chemspider API (no units) --- FourmiCrawler/parsers/ChemSpider.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index d468c39..ae91e8b 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -101,7 +101,20 @@ class ChemSpider(Parser): return synonym def parse_extendedinfo(self, response): - pass + sel = Selector(response) + properties = [] + names = sel.xpath('*').xpath('name()').extract() + values = sel.xpath('*').xpath('text()').extract() + for (name, value) in zip(names,values): + result = Result({ + 'attribute': name, + 'value': value, + 'source': 'ChemSpider', + 'reliability': '', + 'conditions': '' + }) + properties.append(result) + return properties def parse_searchrequest(self, response): sel = Selector(response) From 2e95d35283b33bb8134e1a5c2e4fd699f37854fa Mon Sep 17 00:00:00 2001 From: RTB Date: Thu, 17 Apr 2014 21:30:53 +0200 Subject: [PATCH 23/52] modified parse_synonyms and new_synonym to include a Selector for future edits --- FourmiCrawler/parsers/ChemSpider.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index ae91e8b..8dc0103 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -76,12 +76,17 @@ class ChemSpider(Parser): def parse_synonyms(self, sel): requests = [] synonyms = [] - for syn in sel.xpath('//p[@class="syn"]/strong/text()').extract(): - synonyms.append( self.new_synonym( syn, 'high' ) ) - for syn in sel.xpath('//p[@class="syn"]/span[@class="synonym_confirmed"]/text()').extract(): - synonyms.append( self.new_synonym( syn, 'medium' ) ) - for syn in sel.xpath('//p[@class="syn"]/span[@class=""]/text()').extract(): - synonyms.append( self.new_synonym( syn, 'low' ) ) + for syn in sel.xpath('//p[@class="syn"][strong]'): + name = syn.xpath('strong/text()').extract()[0] + synonyms.append(self.new_synonym(syn, name, 'high')) + for syn in sel.xpath( + '//p[@class="syn"][span[@class="synonym_confirmed"]]'): + name = syn.xpath( + 'span[@class="synonym_confirmed"]/text()').extract()[0] + synonyms.append(self.new_synonym(syn, name, 'medium')) + for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'): + name = syn.xpath('span[@class=""]/text()').extract()[0] + synonyms.append(self.new_synonym(syn, name, 'low')) for synonym in synonyms: if synonym['reliability'] == 'high': @@ -89,7 +94,7 @@ class ChemSpider(Parser): return requests - def new_synonym(self, name, reliability): + def new_synonym(self, sel, name, reliability): log.msg('CS synonym: %s (%s)' % (name, reliability), level=log.DEBUG) self.ignore_list.append(name) synonym = Result() From 4f2c046c9c1d8d7a37e9d8214ec59ead11e1b95c Mon Sep 17 00:00:00 2001 From: RTB Date: Thu, 17 Apr 2014 22:06:45 +0200 Subject: [PATCH 24/52] rewrote parse_synonyms and new_synonym to use an internal dictionary structure --- FourmiCrawler/parsers/ChemSpider.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 8dc0103..7141839 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -78,31 +78,29 @@ class ChemSpider(Parser): synonyms = [] for syn in sel.xpath('//p[@class="syn"][strong]'): name = syn.xpath('strong/text()').extract()[0] - synonyms.append(self.new_synonym(syn, name, 'high')) + synonyms.append(self.new_synonym(syn, name, 'expert')) for syn in sel.xpath( '//p[@class="syn"][span[@class="synonym_confirmed"]]'): name = syn.xpath( 'span[@class="synonym_confirmed"]/text()').extract()[0] - synonyms.append(self.new_synonym(syn, name, 'medium')) + synonyms.append(self.new_synonym(syn, name, 'user')) for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'): name = syn.xpath('span[@class=""]/text()').extract()[0] - synonyms.append(self.new_synonym(syn, name, 'low')) + synonyms.append(self.new_synonym(syn, name, 'nonvalidated')) for synonym in synonyms: - if synonym['reliability'] == 'high': - self._Parser__spider.get_synonym_requests(synonym['value']) + if synonym['category'] == 'expert': + self._Parser__spider.get_synonym_requests(synonym['name']) return requests def new_synonym(self, sel, name, reliability): log.msg('CS synonym: %s (%s)' % (name, reliability), level=log.DEBUG) self.ignore_list.append(name) - synonym = Result() - synonym['attribute'] = 'synonym' - synonym['value'] = name - synonym['source'] = 'ChemSpider' - synonym['reliability'] = reliability - synonym['conditions'] = '' + synonym = { + 'name': name, + 'category': category, + } return synonym def parse_extendedinfo(self, response): From ce5eeb56a64fcf96b228c180e878dc568a94a3fc Mon Sep 17 00:00:00 2001 From: RTB Date: Thu, 17 Apr 2014 22:37:37 +0200 Subject: [PATCH 25/52] added scraping of synonym language --- FourmiCrawler/parsers/ChemSpider.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 7141839..98f0c05 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -94,13 +94,20 @@ class ChemSpider(Parser): return requests - def new_synonym(self, sel, name, reliability): - log.msg('CS synonym: %s (%s)' % (name, reliability), level=log.DEBUG) + def new_synonym(self, sel, name, category): self.ignore_list.append(name) + language = sel.xpath('span[@class="synonym_language"]/text()').extract() + if language: + language = language[0][1:-1] + else: + language = 'English' synonym = { 'name': name, 'category': category, + 'language': language } + log.msg('CS synonym: %s (%s) (%s)' % (name, category, language), + level=log.DEBUG) return synonym def parse_extendedinfo(self, response): From 04751b6670a5814b9b20c984eb59cd9cd95d96b0 Mon Sep 17 00:00:00 2001 From: RTB Date: Thu, 17 Apr 2014 22:47:43 +0200 Subject: [PATCH 26/52] chemspider parser now only emits synonyms labeled as 'English' --- FourmiCrawler/parsers/ChemSpider.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 98f0c05..39ceb3b 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -89,7 +89,8 @@ class ChemSpider(Parser): synonyms.append(self.new_synonym(syn, name, 'nonvalidated')) for synonym in synonyms: - if synonym['category'] == 'expert': + if synonym['category'] == 'expert' and + synonym['language'] == 'English': self._Parser__spider.get_synonym_requests(synonym['name']) return requests From 119d48890d3d4a1315ae4419d7862a17d1d2f81e Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 12:14:54 +0200 Subject: [PATCH 27/52] fixed conditional for emitting synonyms, it compiles again --- FourmiCrawler/parsers/ChemSpider.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 39ceb3b..c888bd1 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -88,10 +88,10 @@ class ChemSpider(Parser): name = syn.xpath('span[@class=""]/text()').extract()[0] synonyms.append(self.new_synonym(syn, name, 'nonvalidated')) - for synonym in synonyms: - if synonym['category'] == 'expert' and - synonym['language'] == 'English': - self._Parser__spider.get_synonym_requests(synonym['name']) + for syn in synonyms: + if (syn['category'] == 'expert' and syn['language'] == 'English'): + log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG) + self._Parser__spider.get_synonym_requests(syn['name']) return requests From ae21fa7c6719dc5055282f1bc11f7c78e58f30e0 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 13:16:22 +0200 Subject: [PATCH 28/52] chemspider now scrapes for reference data on synonyms --- FourmiCrawler/parsers/ChemSpider.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index c888bd1..3b22ce4 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -102,13 +102,26 @@ class ChemSpider(Parser): language = language[0][1:-1] else: language = 'English' + log.msg('CS synonym: %s (%s) (%s)' % (name, category, language), + level=log.DEBUG) + references = [] + for ref in sel.xpath('span[@class="synonym_ref"]'): + refname = ref.xpath('normalize-space(string())') + references.append({'name': refname.extract()[0][1:-1], 'URI': ''}) + for ref in sel.xpath('a[@class="synonym_ref"]'): + references.append({ + 'name': ref.xpath('@title').extract()[0], + 'URI': ref.xpath('@href').extract()[0] + }) + for ref in references: + log.msg('CS synonym ref: %s %s' % (ref['name'], ref['URI']), + level=log.DEBUG) synonym = { 'name': name, 'category': category, - 'language': language + 'language': language, + 'references': references } - log.msg('CS synonym: %s (%s) (%s)' % (name, category, language), - level=log.DEBUG) return synonym def parse_extendedinfo(self, response): From 9389af99ba7b66c5bec99d8986444bbcc60d608e Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 13:17:24 +0200 Subject: [PATCH 29/52] removed manual Requests for wikipedia URLs as wikipedia parser handles those through synonyms --- FourmiCrawler/parsers/ChemSpider.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 3b22ce4..cf4bae7 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -26,8 +26,6 @@ class ChemSpider(Parser): requests.extend(requests_synonyms) requests_properties = self.parse_properties(sel) requests.extend(requests_properties) - for wiki_url in sel.xpath('.//p[@class="syn"][strong]/a[@title="Wiki"]/@href').extract(): - requests.append( Request(url=wiki_url) ) return requests From a4a21f2578f8b0eefb66c74fe1a51c07923c2885 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 13:19:05 +0200 Subject: [PATCH 30/52] changed default reliability from empty string to Unknown as per UML design --- FourmiCrawler/parsers/ChemSpider.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index cf4bae7..7f01323 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -41,7 +41,7 @@ class ChemSpider(Parser): 'attribute': prop_name.extract().encode('utf-8'), 'value': prop_values[i].extract().encode('utf-8'), 'source': 'ChemSpider Predicted - ACD/Labs Tab', - 'reliability': '', + 'reliability': 'Unknown', 'conditions': '' }) properties.append(new_prop) @@ -61,7 +61,7 @@ class ChemSpider(Parser): 'attribute': property_name, 'value': line.xpath('text()').extract()[0].rstrip(), 'source': line.xpath('strong/text()').extract()[0].rstrip(), - 'reliability': '', + 'reliability': 'Unknown', 'conditions': '' }) properties.append(new_prop) @@ -132,7 +132,7 @@ class ChemSpider(Parser): 'attribute': name, 'value': value, 'source': 'ChemSpider', - 'reliability': '', + 'reliability': 'Unknown', 'conditions': '' }) properties.append(result) From bf4a5bb41f44448adfac478f961b0b5c2a2d3f37 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 13:36:33 +0200 Subject: [PATCH 31/52] added scraping of synonyms labeled as 'synonym_cn' --- FourmiCrawler/parsers/ChemSpider.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 7f01323..fd2b84c 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -74,6 +74,9 @@ class ChemSpider(Parser): def parse_synonyms(self, sel): requests = [] synonyms = [] + for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'): + name = syn.xpath('span[@class="synonym_cn"]/text()').extract()[0] + synonyms.append(self.new_synonym(syn, name, 'expert')) for syn in sel.xpath('//p[@class="syn"][strong]'): name = syn.xpath('strong/text()').extract()[0] synonyms.append(self.new_synonym(syn, name, 'expert')) From 75d248e6cfd9b0b7ff3df4a29af4041d5c236557 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 13:45:32 +0200 Subject: [PATCH 32/52] changed for loop in parse_properties to use zip instead of enumerate --- FourmiCrawler/parsers/ChemSpider.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index fd2b84c..afae427 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -36,10 +36,10 @@ class ChemSpider(Parser): td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath('normalize-space(string())') prop_names = td_list[::2] prop_values = td_list[1::2] - for i, prop_name in enumerate(prop_names): + for (prop_name, prop_value) in zip(prop_names, prop_values): new_prop = Result({ 'attribute': prop_name.extract().encode('utf-8'), - 'value': prop_values[i].extract().encode('utf-8'), + 'value': prop_value.extract().encode('utf-8'), 'source': 'ChemSpider Predicted - ACD/Labs Tab', 'reliability': 'Unknown', 'conditions': '' From 22c765b6e567132d0b43b5a5d2dfe3d601e24ebb Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 14:08:48 +0200 Subject: [PATCH 33/52] simplified setting of Results for Predicted ACD/Labs tab --- FourmiCrawler/parsers/ChemSpider.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index afae427..a15daaf 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -3,6 +3,7 @@ from scrapy import log from scrapy.http import Request from scrapy.selector import Selector from FourmiCrawler.items import Result +import re """ This parser will manage searching for chemicals through the ChemsSpider API, @@ -37,9 +38,12 @@ class ChemSpider(Parser): prop_names = td_list[::2] prop_values = td_list[1::2] for (prop_name, prop_value) in zip(prop_names, prop_values): + prop_name = prop_name.extract().encode('utf-8') + prop_value = prop_value.extract().encode('utf-8') + new_prop = Result({ - 'attribute': prop_name.extract().encode('utf-8'), - 'value': prop_value.extract().encode('utf-8'), + 'attribute': prop_name, + 'value': prop_value, 'source': 'ChemSpider Predicted - ACD/Labs Tab', 'reliability': 'Unknown', 'conditions': '' From cd8a64816f5f455569482fb19bfbf72e78f7cba2 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 14:10:53 +0200 Subject: [PATCH 34/52] removed colon at end of attributes in Experimental and Predicted ACD/labs tabs --- FourmiCrawler/parsers/ChemSpider.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index a15daaf..169e836 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -42,7 +42,7 @@ class ChemSpider(Parser): prop_value = prop_value.extract().encode('utf-8') new_prop = Result({ - 'attribute': prop_name, + 'attribute': prop_name[:-1], 'value': prop_value, 'source': 'ChemSpider Predicted - ACD/Labs Tab', 'reliability': 'Unknown', @@ -62,7 +62,7 @@ class ChemSpider(Parser): property_name = line.xpath('span/text()').extract()[0].rstrip() else: new_prop = Result({ - 'attribute': property_name, + 'attribute': property_name[:-1], 'value': line.xpath('text()').extract()[0].rstrip(), 'source': line.xpath('strong/text()').extract()[0].rstrip(), 'reliability': 'Unknown', From 3bf8dccf18d5d7f7b5a7d7b5138ffc82d51b310a Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 14:59:56 +0200 Subject: [PATCH 35/52] properties from Predicted - ACD/Labs tab now include conditions from attribute variable --- FourmiCrawler/parsers/ChemSpider.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 169e836..3c88728 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -38,15 +38,21 @@ class ChemSpider(Parser): prop_names = td_list[::2] prop_values = td_list[1::2] for (prop_name, prop_value) in zip(prop_names, prop_values): - prop_name = prop_name.extract().encode('utf-8') + prop_name = prop_name.extract().encode('utf-8')[:-1] prop_value = prop_value.extract().encode('utf-8') + prop_conditions = '' + + m = re.match(r'(.*) \((.*)\)', prop_name) + if m: + prop_name = m.group(1) + prop_conditions = m.group(2) new_prop = Result({ - 'attribute': prop_name[:-1], + 'attribute': prop_name, 'value': prop_value, 'source': 'ChemSpider Predicted - ACD/Labs Tab', 'reliability': 'Unknown', - 'conditions': '' + 'conditions': prop_conditions }) properties.append(new_prop) log.msg('CS prop: |%s| |%s| |%s|' \ From f2cacb79eb6d193dacbf95c03b83e8338840e819 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 15:03:06 +0200 Subject: [PATCH 36/52] properties from Predicted - ACD/Labs tab now include conditions from value variable --- FourmiCrawler/parsers/ChemSpider.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 3c88728..83be265 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -47,6 +47,11 @@ class ChemSpider(Parser): prop_name = m.group(1) prop_conditions = m.group(2) + m = re.match(r'(.*) at (.*)', prop_value) + if m: + prop_value = m.group(1) + prop_conditions = m.group(2) + new_prop = Result({ 'attribute': prop_name, 'value': prop_value, From c1c7cfc117fc80c58b8fe4c87670b53267db75a5 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 15:12:22 +0200 Subject: [PATCH 37/52] edited global strings to be consistent (PEP-8) --- FourmiCrawler/parsers/ChemSpider.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 83be265..6fee3aa 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -12,11 +12,13 @@ The token required for the API should be in a configuration file somewhere. """ class ChemSpider(Parser): - website = "http://www.chemspider.com/*" + website = 'http://www.chemspider.com/*' - search = "Search.asmx/SimpleSearch?query=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338" - structure = "Chemical-Structure.%s.html" - extendedinfo = "MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338" + search = ('Search.asmx/SimpleSearch?query=%s&token=' + '052bfd06-5ce4-43d6-bf12-89eabefd2338') + structure = 'Chemical-Structure.%s.html' + extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' + '052bfd06-5ce4-43d6-bf12-89eabefd2338') ignore_list = [] From 9aae8d2d0730cf7dc31e8187a59bb0af51898aed Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 15:38:07 +0200 Subject: [PATCH 38/52] chemspider parse_properties is now PEP-8 compliant, hopefully --- FourmiCrawler/parsers/ChemSpider.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 6fee3aa..c70f5f1 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -36,7 +36,8 @@ class ChemSpider(Parser): requests = [] properties = [] - td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath('normalize-space(string())') + td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath( + 'normalize-space(string())') prop_names = td_list[::2] prop_values = td_list[1::2] for (prop_name, prop_value) in zip(prop_names, prop_values): @@ -62,14 +63,16 @@ class ChemSpider(Parser): 'conditions': prop_conditions }) properties.append(new_prop) - log.msg('CS prop: |%s| |%s| |%s|' \ - % (new_prop['attribute'],new_prop['value'], new_prop['source']), - level=log.DEBUG) + log.msg('CS prop: |%s| |%s| |%s|' % + (new_prop['attribute'], new_prop['value'], new_prop['source']), + level=log.DEBUG) - scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical Properties"]//li/table/tr/td') + scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical ' + 'Properties"]//li/table/tr/td') if not scraped_list: return properties - property_name = scraped_list.pop(0).xpath('span/text()').extract()[0].rstrip() + property_name = scraped_list.pop(0).xpath( + 'span/text()').extract()[0].rstrip() for line in scraped_list: if line.xpath('span/text()'): property_name = line.xpath('span/text()').extract()[0].rstrip() @@ -77,14 +80,15 @@ class ChemSpider(Parser): new_prop = Result({ 'attribute': property_name[:-1], 'value': line.xpath('text()').extract()[0].rstrip(), - 'source': line.xpath('strong/text()').extract()[0].rstrip(), + 'source': line.xpath( + 'strong/text()').extract()[0].rstrip(), 'reliability': 'Unknown', 'conditions': '' }) properties.append(new_prop) - log.msg('CS prop: |%s| |%s| |%s|' \ - % (new_prop['attribute'],new_prop['value'], new_prop['source']), - level=log.DEBUG) + log.msg('CS prop: |%s| |%s| |%s|' % + (new_prop['attribute'], new_prop['value'], + new_prop['source']), level=log.DEBUG) return properties From 319e02871701c11d953044d534c1c65cb87e13b2 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 15:55:04 +0200 Subject: [PATCH 39/52] chemspider parse_properties a bit more PEP-8 compliant --- FourmiCrawler/parsers/ChemSpider.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index c70f5f1..946f853 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -61,7 +61,7 @@ class ChemSpider(Parser): 'source': 'ChemSpider Predicted - ACD/Labs Tab', 'reliability': 'Unknown', 'conditions': prop_conditions - }) + }) properties.append(new_prop) log.msg('CS prop: |%s| |%s| |%s|' % (new_prop['attribute'], new_prop['value'], new_prop['source']), @@ -81,7 +81,7 @@ class ChemSpider(Parser): 'attribute': property_name[:-1], 'value': line.xpath('text()').extract()[0].rstrip(), 'source': line.xpath( - 'strong/text()').extract()[0].rstrip(), + 'strong/text()').extract()[0].rstrip(), 'reliability': 'Unknown', 'conditions': '' }) From 479182d77e49ddae0b7604ce4897a9de834364d2 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 16:07:06 +0200 Subject: [PATCH 40/52] chemspider new_synonym is now PEP-8 compliant --- FourmiCrawler/parsers/ChemSpider.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 946f853..70f0fb9 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -119,9 +119,9 @@ class ChemSpider(Parser): def new_synonym(self, sel, name, category): self.ignore_list.append(name) - language = sel.xpath('span[@class="synonym_language"]/text()').extract() + language = sel.xpath('span[@class="synonym_language"]/text()') if language: - language = language[0][1:-1] + language = language.extract()[0][1:-1] else: language = 'English' log.msg('CS synonym: %s (%s) (%s)' % (name, category, language), @@ -129,7 +129,10 @@ class ChemSpider(Parser): references = [] for ref in sel.xpath('span[@class="synonym_ref"]'): refname = ref.xpath('normalize-space(string())') - references.append({'name': refname.extract()[0][1:-1], 'URI': ''}) + references.append({ + 'name': refname.extract()[0][1:-1], + 'URI': '' + }) for ref in sel.xpath('a[@class="synonym_ref"]'): references.append({ 'name': ref.xpath('@title').extract()[0], @@ -137,13 +140,13 @@ class ChemSpider(Parser): }) for ref in references: log.msg('CS synonym ref: %s %s' % (ref['name'], ref['URI']), - level=log.DEBUG) + level=log.DEBUG) synonym = { - 'name': name, - 'category': category, - 'language': language, - 'references': references - } + 'name': name, + 'category': category, + 'language': language, + 'references': references + } return synonym def parse_extendedinfo(self, response): From fa22356cb277e12ed61e0ec50ed36f9c2021561a Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 16:12:07 +0200 Subject: [PATCH 41/52] chemspider parse_extendedinfo is now PEP-8 compliant --- FourmiCrawler/parsers/ChemSpider.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 70f0fb9..35638f4 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -156,12 +156,12 @@ class ChemSpider(Parser): values = sel.xpath('*').xpath('text()').extract() for (name, value) in zip(names,values): result = Result({ - 'attribute': name, - 'value': value, - 'source': 'ChemSpider', - 'reliability': 'Unknown', - 'conditions': '' - }) + 'attribute': name, + 'value': value, + 'source': 'ChemSpider', + 'reliability': 'Unknown', + 'conditions': '' + }) properties.append(result) return properties From 074fbdf9e20c2d9bd2228eab4a2038e781aa477b Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 16:13:44 +0200 Subject: [PATCH 42/52] changed source for properties by parse_extendedinfo to 'ChemSpider ExtendedCompoundInfo' --- FourmiCrawler/parsers/ChemSpider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 35638f4..dac27cf 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -158,7 +158,7 @@ class ChemSpider(Parser): result = Result({ 'attribute': name, 'value': value, - 'source': 'ChemSpider', + 'source': 'ChemSpider ExtendedCompoundInfo', 'reliability': 'Unknown', 'conditions': '' }) From f18f23dfc6a20af9a1ca270466ff1d69ab9fa44e Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 16:20:47 +0200 Subject: [PATCH 43/52] chemspider new_compound_request is now PEP-8 compliant --- FourmiCrawler/parsers/ChemSpider.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index dac27cf..5ef5381 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -174,8 +174,10 @@ class ChemSpider(Parser): structure_url = self.website[:-1] + self.structure % csid extendedinfo_url = self.website[:-1] + self.extendedinfo % csid log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG) - return [Request(url=structure_url, callback=self.parse), - Request(url=extendedinfo_url, callback=self.parse_extendedinfo)] + return [Request(url=structure_url, + callback=self.parse), + Request(url=extendedinfo_url, + callback=self.parse_extendedinfo)] def new_compound_request(self,compound): if compound in self.ignore_list: #TODO: add regular expression From 3862bfb7d8cf12af58d980ce6fb74b98d41c62d6 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 16:54:30 +0200 Subject: [PATCH 44/52] added comments for ChemSpider class, parse_properties, and parse_synonyms --- FourmiCrawler/parsers/ChemSpider.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 5ef5381..2d98544 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -5,13 +5,14 @@ from scrapy.selector import Selector from FourmiCrawler.items import Result import re -""" -This parser will manage searching for chemicals through the ChemsSpider API, -and parsing the resulting ChemSpider page. -The token required for the API should be in a configuration file somewhere. -""" class ChemSpider(Parser): - +"""ChemSpider scraper for synonyms and properties + +This parser will manage searching for chemicals through the +ChemsSpider API, and parsing the resulting ChemSpider page. +The token required for the API should be in a configuration file +somewhere. +""" website = 'http://www.chemspider.com/*' search = ('Search.asmx/SimpleSearch?query=%s&token=' @@ -33,23 +34,29 @@ class ChemSpider(Parser): return requests def parse_properties(self, sel): + """scrape Experimental Data and Predicted ACD/Labs tabs""" requests = [] properties = [] + # Predicted - ACD/Labs tab + # TODO: test if tab contains data, some chemicals do not have data here td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath( 'normalize-space(string())') prop_names = td_list[::2] prop_values = td_list[1::2] for (prop_name, prop_value) in zip(prop_names, prop_values): + # [:-1] is to remove the colon at the end, TODO: test for colon prop_name = prop_name.extract().encode('utf-8')[:-1] prop_value = prop_value.extract().encode('utf-8') prop_conditions = '' + # Match for condition in parentheses m = re.match(r'(.*) \((.*)\)', prop_name) if m: prop_name = m.group(1) prop_conditions = m.group(2) + # Match for condition in value seperated by an 'at' m = re.match(r'(.*) at (.*)', prop_value) if m: prop_value = m.group(1) @@ -67,10 +74,12 @@ class ChemSpider(Parser): (new_prop['attribute'], new_prop['value'], new_prop['source']), level=log.DEBUG) + # Experimental Data Tab, Physico-chemical properties in particular scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical ' 'Properties"]//li/table/tr/td') if not scraped_list: return properties + # Format is: property name followed by a list of values property_name = scraped_list.pop(0).xpath( 'span/text()').extract()[0].rstrip() for line in scraped_list: @@ -93,23 +102,30 @@ class ChemSpider(Parser): return properties def parse_synonyms(self, sel): + """Scrape list of Names and Identifiers""" requests = [] synonyms = [] + + # Exact type for this is unknown, but equivalent to Validated by Expert for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'): name = syn.xpath('span[@class="synonym_cn"]/text()').extract()[0] synonyms.append(self.new_synonym(syn, name, 'expert')) + # These synonyms are labeled by ChemSpider as "Validated by Experts" for syn in sel.xpath('//p[@class="syn"][strong]'): name = syn.xpath('strong/text()').extract()[0] synonyms.append(self.new_synonym(syn, name, 'expert')) + # These synonyms are labeled by ChemSpider as "Validated by Users" for syn in sel.xpath( '//p[@class="syn"][span[@class="synonym_confirmed"]]'): name = syn.xpath( 'span[@class="synonym_confirmed"]/text()').extract()[0] synonyms.append(self.new_synonym(syn, name, 'user')) + # These syonyms are labeled as "Non-validated" and assumed unreliable for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'): name = syn.xpath('span[@class=""]/text()').extract()[0] synonyms.append(self.new_synonym(syn, name, 'nonvalidated')) + # TODO: confirm if English User-Validated synonyms are OK too for syn in synonyms: if (syn['category'] == 'expert' and syn['language'] == 'English'): log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG) From 2ac6d1711d9665cab322c8c201dca8046abf4472 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 17:11:04 +0200 Subject: [PATCH 45/52] added comments for chemspider new_synonym --- FourmiCrawler/parsers/ChemSpider.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 2d98544..6fde538 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -134,15 +134,19 @@ somewhere. return requests def new_synonym(self, sel, name, category): + """Scrape for a single synonym at a given HTML tag""" self.ignore_list.append(name) language = sel.xpath('span[@class="synonym_language"]/text()') if language: + # The [1:-1] is to remove brackets around the language name language = language.extract()[0][1:-1] else: + # If language is not given, English is assumed, TODO: confirm language = 'English' log.msg('CS synonym: %s (%s) (%s)' % (name, category, language), level=log.DEBUG) references = [] + # A synonym can have multiple references, each optionally with link for ref in sel.xpath('span[@class="synonym_ref"]'): refname = ref.xpath('normalize-space(string())') references.append({ From 3c5dbc44dc6b73e5cc8e502107c2517876bce496 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 17:14:19 +0200 Subject: [PATCH 46/52] added comments for chemspider parse_extendedinfo --- FourmiCrawler/parsers/ChemSpider.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 6fde538..d405499 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -170,6 +170,7 @@ somewhere. return synonym def parse_extendedinfo(self, response): + """Scrape data from the ChemSpider GetExtendedCompoundInfo API""" sel = Selector(response) properties = [] names = sel.xpath('*').xpath('name()').extract() @@ -177,7 +178,7 @@ somewhere. for (name, value) in zip(names,values): result = Result({ 'attribute': name, - 'value': value, + 'value': value, #These values have no unit! 'source': 'ChemSpider ExtendedCompoundInfo', 'reliability': 'Unknown', 'conditions': '' From 63fb9f4733ac438e6a440b0320380c0564e580c6 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 18 Apr 2014 17:33:00 +0200 Subject: [PATCH 47/52] added comment to parse_searchrequest and added optional todo for extract()[0] usage --- FourmiCrawler/parsers/ChemSpider.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index d405499..2904ca2 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -5,6 +5,9 @@ from scrapy.selector import Selector from FourmiCrawler.items import Result import re +# TODO: Maybe clean up usage of '.extract()[0]', because it will raise an +# IndexError exception if the xpath matches nothing + class ChemSpider(Parser): """ChemSpider scraper for synonyms and properties @@ -187,6 +190,7 @@ somewhere. return properties def parse_searchrequest(self, response): + """Parse the initial response of the ChemSpider Search API """ sel = Selector(response) log.msg('chemspider parse_searchrequest', level=log.DEBUG) sel.register_namespace('cs', 'http://www.chemspider.com/') From 7a1e99605b45b52d3aff77ce932a2b84ee84af8f Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 22 Apr 2014 18:40:14 +0200 Subject: [PATCH 48/52] Uniform TODO tags, indentation faults. --- FourmiCrawler/parsers/ChemSpider.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 2904ca2..35bc1f3 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -5,8 +5,7 @@ from scrapy.selector import Selector from FourmiCrawler.items import Result import re -# TODO: Maybe clean up usage of '.extract()[0]', because it will raise an -# IndexError exception if the xpath matches nothing +# [TODO] - Maybe clean up usage of '.extract()[0]', because it will raise an IndexError exception if the xpath matches nothing. class ChemSpider(Parser): """ChemSpider scraper for synonyms and properties @@ -18,6 +17,7 @@ somewhere. """ website = 'http://www.chemspider.com/*' + # [TODO] - Save and access token of specific user. search = ('Search.asmx/SimpleSearch?query=%s&token=' '052bfd06-5ce4-43d6-bf12-89eabefd2338') structure = 'Chemical-Structure.%s.html' @@ -42,13 +42,13 @@ somewhere. properties = [] # Predicted - ACD/Labs tab - # TODO: test if tab contains data, some chemicals do not have data here + # [TODO] - test if tab contains data, some chemicals do not have data here td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath( 'normalize-space(string())') prop_names = td_list[::2] prop_values = td_list[1::2] for (prop_name, prop_value) in zip(prop_names, prop_values): - # [:-1] is to remove the colon at the end, TODO: test for colon + # [:-1] is to remove the colon at the end, [TODO] - test for colon prop_name = prop_name.extract().encode('utf-8')[:-1] prop_value = prop_value.extract().encode('utf-8') prop_conditions = '' @@ -61,7 +61,7 @@ somewhere. # Match for condition in value seperated by an 'at' m = re.match(r'(.*) at (.*)', prop_value) - if m: + if m: prop_value = m.group(1) prop_conditions = m.group(2) @@ -105,7 +105,7 @@ somewhere. return properties def parse_synonyms(self, sel): - """Scrape list of Names and Identifiers""" + """Scrape list of Names and Identifiers""" requests = [] synonyms = [] @@ -128,7 +128,7 @@ somewhere. name = syn.xpath('span[@class=""]/text()').extract()[0] synonyms.append(self.new_synonym(syn, name, 'nonvalidated')) - # TODO: confirm if English User-Validated synonyms are OK too + # [TODO] - confirm if English User-Validated synonyms are OK too for syn in synonyms: if (syn['category'] == 'expert' and syn['language'] == 'English'): log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG) @@ -144,7 +144,7 @@ somewhere. # The [1:-1] is to remove brackets around the language name language = language.extract()[0][1:-1] else: - # If language is not given, English is assumed, TODO: confirm + # If language is not given, English is assumed, [TODO] - confirm language = 'English' log.msg('CS synonym: %s (%s) (%s)' % (name, category, language), level=log.DEBUG) @@ -195,7 +195,7 @@ somewhere. log.msg('chemspider parse_searchrequest', level=log.DEBUG) sel.register_namespace('cs', 'http://www.chemspider.com/') csid = sel.xpath('.//cs:int/text()').extract()[0] - #TODO: handle multiple csids in case of vague search term + # [TODO] - handle multiple csids in case of vague search term structure_url = self.website[:-1] + self.structure % csid extendedinfo_url = self.website[:-1] + self.extendedinfo % csid log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG) @@ -203,10 +203,10 @@ somewhere. callback=self.parse), Request(url=extendedinfo_url, callback=self.parse_extendedinfo)] - + def new_compound_request(self,compound): - if compound in self.ignore_list: #TODO: add regular expression + if compound in self.ignore_list: #[TODO] - add regular expression return None searchurl = self.website[:-1] + self.search % compound log.msg('chemspider compound', level=log.DEBUG) - return Request(url=searchurl, callback=self.parse_searchrequest) + return Request(url=searchurl, callback=self.parse_searchrequest) \ No newline at end of file From 0da2d74e2cd84b73c40fbb8cf3a209364f3d28af Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 22 Apr 2014 18:46:49 +0200 Subject: [PATCH 49/52] PEP-8 indentation for multi-line statements --- FourmiCrawler/parsers/ChemSpider.py | 56 ++++++++++++++--------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 35bc1f3..26bca3d 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -8,21 +8,21 @@ import re # [TODO] - Maybe clean up usage of '.extract()[0]', because it will raise an IndexError exception if the xpath matches nothing. class ChemSpider(Parser): -"""ChemSpider scraper for synonyms and properties + """ChemSpider scraper for synonyms and properties -This parser will manage searching for chemicals through the -ChemsSpider API, and parsing the resulting ChemSpider page. -The token required for the API should be in a configuration file -somewhere. -""" + This parser will manage searching for chemicals through the + ChemsSpider API, and parsing the resulting ChemSpider page. + The token required for the API should be in a configuration file + somewhere. + """ website = 'http://www.chemspider.com/*' # [TODO] - Save and access token of specific user. search = ('Search.asmx/SimpleSearch?query=%s&token=' - '052bfd06-5ce4-43d6-bf12-89eabefd2338') + '052bfd06-5ce4-43d6-bf12-89eabefd2338') structure = 'Chemical-Structure.%s.html' extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' - '052bfd06-5ce4-43d6-bf12-89eabefd2338') + '052bfd06-5ce4-43d6-bf12-89eabefd2338') ignore_list = [] @@ -44,7 +44,7 @@ somewhere. # Predicted - ACD/Labs tab # [TODO] - test if tab contains data, some chemicals do not have data here td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath( - 'normalize-space(string())') + 'normalize-space(string())') prop_names = td_list[::2] prop_values = td_list[1::2] for (prop_name, prop_value) in zip(prop_names, prop_values): @@ -71,20 +71,20 @@ somewhere. 'source': 'ChemSpider Predicted - ACD/Labs Tab', 'reliability': 'Unknown', 'conditions': prop_conditions - }) + }) properties.append(new_prop) log.msg('CS prop: |%s| |%s| |%s|' % - (new_prop['attribute'], new_prop['value'], new_prop['source']), - level=log.DEBUG) + (new_prop['attribute'], new_prop['value'], new_prop['source']), + level=log.DEBUG) # Experimental Data Tab, Physico-chemical properties in particular scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical ' - 'Properties"]//li/table/tr/td') + 'Properties"]//li/table/tr/td') if not scraped_list: return properties # Format is: property name followed by a list of values property_name = scraped_list.pop(0).xpath( - 'span/text()').extract()[0].rstrip() + 'span/text()').extract()[0].rstrip() for line in scraped_list: if line.xpath('span/text()'): property_name = line.xpath('span/text()').extract()[0].rstrip() @@ -93,14 +93,14 @@ somewhere. 'attribute': property_name[:-1], 'value': line.xpath('text()').extract()[0].rstrip(), 'source': line.xpath( - 'strong/text()').extract()[0].rstrip(), + 'strong/text()').extract()[0].rstrip(), 'reliability': 'Unknown', 'conditions': '' - }) + }) properties.append(new_prop) log.msg('CS prop: |%s| |%s| |%s|' % - (new_prop['attribute'], new_prop['value'], - new_prop['source']), level=log.DEBUG) + (new_prop['attribute'], new_prop['value'], + new_prop['source']), level=log.DEBUG) return properties @@ -119,9 +119,9 @@ somewhere. synonyms.append(self.new_synonym(syn, name, 'expert')) # These synonyms are labeled by ChemSpider as "Validated by Users" for syn in sel.xpath( - '//p[@class="syn"][span[@class="synonym_confirmed"]]'): + '//p[@class="syn"][span[@class="synonym_confirmed"]]'): name = syn.xpath( - 'span[@class="synonym_confirmed"]/text()').extract()[0] + 'span[@class="synonym_confirmed"]/text()').extract()[0] synonyms.append(self.new_synonym(syn, name, 'user')) # These syonyms are labeled as "Non-validated" and assumed unreliable for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'): @@ -155,12 +155,12 @@ somewhere. references.append({ 'name': refname.extract()[0][1:-1], 'URI': '' - }) + }) for ref in sel.xpath('a[@class="synonym_ref"]'): references.append({ 'name': ref.xpath('@title').extract()[0], 'URI': ref.xpath('@href').extract()[0] - }) + }) for ref in references: log.msg('CS synonym ref: %s %s' % (ref['name'], ref['URI']), level=log.DEBUG) @@ -169,7 +169,7 @@ somewhere. 'category': category, 'language': language, 'references': references - } + } return synonym def parse_extendedinfo(self, response): @@ -178,14 +178,14 @@ somewhere. properties = [] names = sel.xpath('*').xpath('name()').extract() values = sel.xpath('*').xpath('text()').extract() - for (name, value) in zip(names,values): + for (name, value) in zip(names, values): result = Result({ 'attribute': name, - 'value': value, #These values have no unit! + 'value': value, #These values have no unit! 'source': 'ChemSpider ExtendedCompoundInfo', 'reliability': 'Unknown', 'conditions': '' - }) + }) properties.append(result) return properties @@ -204,8 +204,8 @@ somewhere. Request(url=extendedinfo_url, callback=self.parse_extendedinfo)] - def new_compound_request(self,compound): - if compound in self.ignore_list: #[TODO] - add regular expression + def new_compound_request(self, compound): + if compound in self.ignore_list: #[TODO] - add regular expression return None searchurl = self.website[:-1] + self.search % compound log.msg('chemspider compound', level=log.DEBUG) From 648b23e466e76fa699a2fd2ff3e6f7807e3d84c0 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 22 Apr 2014 18:54:10 +0200 Subject: [PATCH 50/52] PEP-8 standards for a lot of things --- FourmiCrawler/parsers/ChemSpider.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 26bca3d..a3288f3 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -5,7 +5,8 @@ from scrapy.selector import Selector from FourmiCrawler.items import Result import re -# [TODO] - Maybe clean up usage of '.extract()[0]', because it will raise an IndexError exception if the xpath matches nothing. +# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception. + class ChemSpider(Parser): """ChemSpider scraper for synonyms and properties @@ -15,6 +16,10 @@ class ChemSpider(Parser): The token required for the API should be in a configuration file somewhere. """ + + def __init__(self): + pass + website = 'http://www.chemspider.com/*' # [TODO] - Save and access token of specific user. @@ -36,9 +41,9 @@ class ChemSpider(Parser): return requests - def parse_properties(self, sel): + @staticmethod + def parse_properties(sel): """scrape Experimental Data and Predicted ACD/Labs tabs""" - requests = [] properties = [] # Predicted - ACD/Labs tab @@ -130,7 +135,7 @@ class ChemSpider(Parser): # [TODO] - confirm if English User-Validated synonyms are OK too for syn in synonyms: - if (syn['category'] == 'expert' and syn['language'] == 'English'): + if syn['category'] == 'expert' and syn['language'] == 'English': log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG) self._Parser__spider.get_synonym_requests(syn['name']) @@ -172,7 +177,8 @@ class ChemSpider(Parser): } return synonym - def parse_extendedinfo(self, response): + @staticmethod + def parse_extendedinfo(response): """Scrape data from the ChemSpider GetExtendedCompoundInfo API""" sel = Selector(response) properties = [] @@ -181,7 +187,7 @@ class ChemSpider(Parser): for (name, value) in zip(names, values): result = Result({ 'attribute': name, - 'value': value, #These values have no unit! + 'value': value, # These values have no unit! 'source': 'ChemSpider ExtendedCompoundInfo', 'reliability': 'Unknown', 'conditions': '' @@ -205,7 +211,7 @@ class ChemSpider(Parser): callback=self.parse_extendedinfo)] def new_compound_request(self, compound): - if compound in self.ignore_list: #[TODO] - add regular expression + if compound in self.ignore_list: # [TODO] - add regular expression return None searchurl = self.website[:-1] + self.search % compound log.msg('chemspider compound', level=log.DEBUG) From ba7bed02504f24209b7223e3701efdafad6e0bad Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 22 Apr 2014 18:55:14 +0200 Subject: [PATCH 51/52] Disabled name mangling for the spider reference in the parsers --- FourmiCrawler/parsers/ChemSpider.py | 2 +- FourmiCrawler/parsers/parser.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index a3288f3..3273107 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -137,7 +137,7 @@ class ChemSpider(Parser): for syn in synonyms: if syn['category'] == 'expert' and syn['language'] == 'English': log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG) - self._Parser__spider.get_synonym_requests(syn['name']) + self._spider.get_synonym_requests(syn['name']) return requests diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py index cac5019..0766bd5 100644 --- a/FourmiCrawler/parsers/parser.py +++ b/FourmiCrawler/parsers/parser.py @@ -7,7 +7,7 @@ class Parser: website should be an regular expression of the urls of request the parser is able to parse. ''' website = "http://something/*" - __spider = None + _spider = None def parse(self, reponse): log.msg("The parse function of the empty parser was used.", level=log.WARNING) From 595af7aa327ce959f646d9c6cdbb21e816d38f57 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 22 Apr 2014 19:03:29 +0200 Subject: [PATCH 52/52] PEP-8 and fixed a bug in set_spider --- FourmiCrawler/parsers/parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py index 0766bd5..feb4535 100644 --- a/FourmiCrawler/parsers/parser.py +++ b/FourmiCrawler/parsers/parser.py @@ -3,12 +3,12 @@ from scrapy import log class Parser: - ''' - website should be an regular expression of the urls of request the parser is able to parse. - ''' - website = "http://something/*" + website = "http://something/*" # Regex of URI's the source is able to parse _spider = None + def __init__(self): + pass + def parse(self, reponse): log.msg("The parse function of the empty parser was used.", level=log.WARNING) pass @@ -18,4 +18,4 @@ class Parser: pass def set_spider(self, spider): - self.__spider = spider + self._spider = spider