Code inspection
This commit is contained in:
parent
87275a6dc8
commit
22ca4afa33
@ -89,7 +89,7 @@ class ChemSpider(Source):
|
||||
|
||||
# Test for properties without values, with one hardcoded exception
|
||||
if (not re.match(r'^\d', prop_value) or
|
||||
(prop_name == 'Polarizability' and prop_value == '10-24cm3')):
|
||||
(prop_name == 'Polarizability' and prop_value == '10-24cm3')):
|
||||
continue
|
||||
|
||||
m = re.match(r'(.*) \((.*)\)', prop_name)
|
||||
@ -122,12 +122,12 @@ class ChemSpider(Source):
|
||||
properties = []
|
||||
|
||||
scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
|
||||
'Properties"]//li/table/tr/td')
|
||||
'Properties"]//li/table/tr/td')
|
||||
if not scraped_list:
|
||||
return properties
|
||||
# Format is: property name followed by a list of values
|
||||
property_name = scraped_list.pop(0).xpath(
|
||||
'span/text()').extract()[0].rstrip()
|
||||
'span/text()').extract()[0].rstrip()
|
||||
for line in scraped_list:
|
||||
if line.xpath('span/text()'):
|
||||
property_name = line.xpath('span/text()').extract()[0].rstrip()
|
||||
@ -251,12 +251,12 @@ class ChemSpider(Source):
|
||||
:return: A Result item
|
||||
"""
|
||||
return Result({
|
||||
'attribute': attribute,
|
||||
'value': value,
|
||||
'source': source,
|
||||
'reliability': self.cfg['reliability'],
|
||||
'conditions': conditions
|
||||
})
|
||||
'attribute': attribute,
|
||||
'value': value,
|
||||
'source': source,
|
||||
'reliability': self.cfg['reliability'],
|
||||
'conditions': conditions
|
||||
})
|
||||
|
||||
def parse_searchrequest(self, response):
|
||||
"""
|
||||
|
@ -313,12 +313,13 @@ class NIST(Source):
|
||||
:param conditions: optional conditions regarding the value
|
||||
:return: A Result item
|
||||
"""
|
||||
return Result({
|
||||
'attribute': attribute,
|
||||
'value': value,
|
||||
'source': 'NIST',
|
||||
'reliability': self.cfg['reliability'],
|
||||
'conditions': conditions
|
||||
return Result(
|
||||
{
|
||||
'attribute': attribute,
|
||||
'value': value,
|
||||
'source': 'NIST',
|
||||
'reliability': self.cfg['reliability'],
|
||||
'conditions': conditions
|
||||
})
|
||||
|
||||
def new_compound_request(self, compound):
|
||||
|
@ -15,7 +15,7 @@ class PubChem(Source):
|
||||
including sources of the values of properties.
|
||||
"""
|
||||
|
||||
#PubChem has its data on compound name, properties and their values on different html pages, so different URLs are used
|
||||
# PubChem has its data on compound name, properties and their values on different html pages, so different URLs are used
|
||||
website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
|
||||
website_www = 'http://www.ncbi.nlm.nih.gov/*'
|
||||
website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
|
||||
@ -54,14 +54,16 @@ class PubChem(Source):
|
||||
n = re.search(r'cid=(\d+)', response.url)
|
||||
if n:
|
||||
cid = n.group(1)
|
||||
log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach
|
||||
# the separate html page which contains the properties and their values
|
||||
log.msg('cid: %s' % cid, level=log.DEBUG) # getting the right id of the compound with which it can reach
|
||||
# the separate html page which contains the properties and their values
|
||||
|
||||
#using this cid to get the right url and scrape it
|
||||
requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data))
|
||||
# using this cid to get the right url and scrape it
|
||||
requests.append(
|
||||
Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
|
||||
return requests
|
||||
|
||||
def parse_data(self, response):
|
||||
@staticmethod
|
||||
def parse_data(response):
|
||||
"""
|
||||
Parse data found in 'Chemical and Physical properties' part of a substance page.
|
||||
:param response: The response with the page to parse
|
||||
@ -74,8 +76,8 @@ class PubChem(Source):
|
||||
props = sel.xpath('//div')
|
||||
|
||||
for prop in props:
|
||||
prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing
|
||||
if prop.xpath('a'): # parsing for single value in property
|
||||
prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing
|
||||
if prop.xpath('a'): # parsing for single value in property
|
||||
prop_source = ''.join(prop.xpath('a/@title').extract())
|
||||
prop_value = ''.join(prop.xpath('a/text()').extract())
|
||||
new_prop = Result({
|
||||
@ -89,7 +91,7 @@ class PubChem(Source):
|
||||
(new_prop['attribute'], new_prop['value'],
|
||||
new_prop['source']), level=log.DEBUG)
|
||||
requests.append(new_prop)
|
||||
elif prop.xpath('ul'): # parsing for multiple values (list) in property
|
||||
elif prop.xpath('ul'): # parsing for multiple values (list) in property
|
||||
prop_values = prop.xpath('ul//li')
|
||||
for prop_li in prop_values:
|
||||
prop_value = ''.join(prop_li.xpath('a/text()').extract())
|
||||
@ -102,8 +104,8 @@ class PubChem(Source):
|
||||
'conditions': ''
|
||||
})
|
||||
log.msg('PubChem prop: |%s| |%s| |%s|' %
|
||||
(new_prop['attribute'], new_prop['value'],
|
||||
new_prop['source']), level=log.DEBUG)
|
||||
(new_prop['attribute'], new_prop['value'],
|
||||
new_prop['source']), level=log.DEBUG)
|
||||
requests.append(new_prop)
|
||||
|
||||
return requests
|
||||
@ -116,7 +118,7 @@ class PubChem(Source):
|
||||
case the search request forwarded to the compound page
|
||||
"""
|
||||
|
||||
#check if pubchem forwarded straight to compound page
|
||||
# check if pubchem forwarded straight to compound page
|
||||
m = re.match(self.website_pubchem, response.url)
|
||||
if m:
|
||||
log.msg('PubChem search forwarded to compound page',
|
||||
|
@ -63,7 +63,7 @@ def search(docopt_arguments, source_loader):
|
||||
source_loader, docopt_arguments["--attributes"].split(','))
|
||||
if conf.scrapy_settings.getbool("LOG_ENABLED"):
|
||||
log.start(conf.scrapy_settings.get("LOG_FILE"),
|
||||
conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
|
||||
conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
|
||||
reactor.run()
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user