From 22ca4afa33058781330fd125b61e23281dcb0c4d Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Fri, 20 Jun 2014 11:21:26 +0200
Subject: [PATCH] Code inspection

---
 FourmiCrawler/sources/ChemSpider.py | 18 +++++++++---------
 FourmiCrawler/sources/NIST.py       | 13 +++++++------
 FourmiCrawler/sources/PubChem.py    | 26 ++++++++++++++------------
 fourmi.py                           |  2 +-
 4 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index b4bf6f0..e95d067 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -89,7 +89,7 @@ class ChemSpider(Source):
 
             # Test for properties without values, with one hardcoded exception
             if (not re.match(r'^\d', prop_value) or
-            (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
+                (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
                 continue
 
             m = re.match(r'(.*) \((.*)\)', prop_name)
@@ -122,12 +122,12 @@ class ChemSpider(Source):
 
         properties = []
         scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
-                                'Properties"]//li/table/tr/td')
+                                 'Properties"]//li/table/tr/td')
         if not scraped_list:
             return properties
         # Format is: property name followed by a list of values
         property_name = scraped_list.pop(0).xpath(
-                'span/text()').extract()[0].rstrip()
+            'span/text()').extract()[0].rstrip()
         for line in scraped_list:
             if line.xpath('span/text()'):
                 property_name = line.xpath('span/text()').extract()[0].rstrip()
@@ -251,12 +251,12 @@ class ChemSpider(Source):
         :return: A Result item
         """
         return Result({
-                          'attribute': attribute,
-                          'value': value,
-                          'source': source,
-                          'reliability': self.cfg['reliability'],
-                          'conditions': conditions
-                      })
+            'attribute': attribute,
+            'value': value,
+            'source': source,
+            'reliability': self.cfg['reliability'],
+            'conditions': conditions
+        })
 
     def parse_searchrequest(self, response):
         """
diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 691b062..52f1332 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -313,12 +313,13 @@ class NIST(Source):
         :param conditions: optional conditions regarding the value
         :return: A Result item
         """
-        return Result({
-            'attribute': attribute,
-            'value': value,
-            'source': 'NIST',
-            'reliability': self.cfg['reliability'],
-            'conditions': conditions
+        return Result(
+            {
+                'attribute': attribute,
+                'value': value,
+                'source': 'NIST',
+                'reliability': self.cfg['reliability'],
+                'conditions': conditions
             })
 
     def new_compound_request(self, compound):
diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 15fa3f9..4cd5304 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -15,7 +15,7 @@ class PubChem(Source):
 
     including sources of the values of properties.
     """
-    #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
+    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
     website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
     website_www = 'http://www.ncbi.nlm.nih.gov/*'
     website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
@@ -54,14 +54,16 @@ class PubChem(Source):
         n = re.search(r'cid=(\d+)', response.url)
         if n:
             cid = n.group(1)
-            log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach
-                                                      # the seperate html page which contains the properties and their values
+            log.msg('cid: %s' % cid, level=log.DEBUG)  # getting the right id of the compound with which it can reach
+            # the seperate html page which contains the properties and their values
 
-            #using this cid to get the right url and scrape it
-            requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data))
+            # using this cid to get the right url and scrape it
+            requests.append(
+                Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
         return requests
 
-    def parse_data(self, response):
+    @staticmethod
+    def parse_data(response):
         """
         Parse data found in 'Chemical and Physical properties' part of a substance page.
         :param response: The response with the page to parse
@@ -74,8 +76,8 @@ class PubChem(Source):
         props = sel.xpath('//div')
 
         for prop in props:
-            prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing
-            if prop.xpath('a'): # parsing for single value in property
+            prop_name = ''.join(prop.xpath('b/text()').extract())  # name of property that it is parsing
+            if prop.xpath('a'):  # parsing for single value in property
                 prop_source = ''.join(prop.xpath('a/@title').extract())
                 prop_value = ''.join(prop.xpath('a/text()').extract())
                 new_prop = Result({
@@ -89,7 +91,7 @@ class PubChem(Source):
                         (new_prop['attribute'], new_prop['value'], new_prop['source']),
                         level=log.DEBUG)
                 requests.append(new_prop)
-            elif prop.xpath('ul'): # parsing for multiple values (list) in property
+            elif prop.xpath('ul'):  # parsing for multiple values (list) in property
                 prop_values = prop.xpath('ul//li')
                 for prop_li in prop_values:
                     prop_value = ''.join(prop_li.xpath('a/text()').extract())
@@ -102,8 +104,8 @@ class PubChem(Source):
                         'conditions': ''
                     })
                     log.msg('PubChem prop: |%s| |%s| |%s|' %
-                        (new_prop['attribute'], new_prop['value'],
-                         new_prop['source']), level=log.DEBUG)
+                            (new_prop['attribute'], new_prop['value'],
+                             new_prop['source']), level=log.DEBUG)
                     requests.append(new_prop)
 
         return requests
@@ -116,7 +118,7 @@ class PubChem(Source):
         case the search request forwarded to the compound page
         """
 
-        #check if pubchem forwarded straight to compound page
+        # check if pubchem forwarded straight to compound page
        m = re.match(self.website_pubchem, response.url)
         if m:
             log.msg('PubChem search forwarded to compound page',
diff --git a/fourmi.py b/fourmi.py
index d6d5fd9..f0caa05 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -63,7 +63,7 @@ def search(docopt_arguments, source_loader):
                   source_loader, docopt_arguments["--attributes"].split(','))
     if conf.scrapy_settings.getbool("LOG_ENABLED"):
         log.start(conf.scrapy_settings.get("LOG_FILE"),
-            conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
+                  conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
     reactor.run()
 
 