Merge branch 'feature/compound-name' into develop
commit 76bf634ad9
@@ -89,7 +89,7 @@ class ChemSpider(Source):
 
             # Test for properties without values, with one hardcoded exception
             if (not re.match(r'^\d', prop_value) or
                     (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
                 continue
 
             m = re.match(r'(.*) \((.*)\)', prop_name)
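For context, a worked example of the two steps in this hunk; the sample property values are hypothetical, not taken from ChemSpider:

    import re

    for prop_name, prop_value in [('Appearance', 'colorless liquid'),
                                  ('Boiling Point (Predicted)', '100 deg C')]:
        # Values that don't start with a digit are skipped, as is the
        # hardcoded unit-only Polarizability artifact.
        if (not re.match(r'^\d', prop_value) or
                (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
            continue  # 'colorless liquid' is dropped here
        # Split a trailing parenthesized qualifier off the property name.
        m = re.match(r'(.*) \((.*)\)', prop_name)
        if m:
            print(m.groups())  # ('Boiling Point', 'Predicted')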
@@ -122,12 +122,12 @@ class ChemSpider(Source):
         properties = []
 
         scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                  'Properties"]//li/table/tr/td')
         if not scraped_list:
             return properties
         # Format is: property name followed by a list of values
         property_name = scraped_list.pop(0).xpath(
             'span/text()').extract()[0].rstrip()
         for line in scraped_list:
             if line.xpath('span/text()'):
                 property_name = line.xpath('span/text()').extract()[0].rstrip()
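The scraped cell list alternates property names and values; a pure-Python sketch of that walk, with dummy cells standing in for the Selector objects above:

    # Each dict mimics one <td> cell: a 'span' key marks a property name,
    # anything else is a value belonging to the most recent name.
    cells = [
        {'span': 'Boiling Point '},   # first cell: property name (popped above)
        {'text': '100 deg C'},        # value for Boiling Point
        {'span': 'Density '},         # a new property name
        {'text': '0.997 g/mL'},       # value for Density
    ]
    properties = []
    property_name = cells.pop(0)['span'].rstrip()
    for line in cells:
        if 'span' in line:
            property_name = line['span'].rstrip()
        else:
            properties.append((property_name, line['text']))
    print(properties)
    # [('Boiling Point', '100 deg C'), ('Density', '0.997 g/mL')]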
@@ -251,12 +251,12 @@ class ChemSpider(Source):
         :return: A Result item
         """
         return Result({
             'attribute': attribute,
             'value': value,
             'source': source,
             'reliability': self.cfg['reliability'],
             'conditions': conditions
         })
 
     def parse_searchrequest(self, response):
         """
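The Result item filled in above is defined elsewhere in the project; a minimal sketch of the shape this hunk assumes — only the field names are confirmed by the diff, the definition itself is assumed:

    from scrapy.item import Item, Field

    class Result(Item):
        # Assumed definition; field names taken from the keys used above.
        attribute = Field()
        value = Field()
        source = Field()
        reliability = Field()
        conditions = Field()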
@@ -313,12 +313,13 @@ class NIST(Source):
         :param conditions: optional conditions regarding the value
         :return: A Result item
         """
-        return Result({
-            'attribute': attribute,
-            'value': value,
-            'source': 'NIST',
-            'reliability': self.cfg['reliability'],
-            'conditions': conditions
-        })
+        return Result(
+            {
+                'attribute': attribute,
+                'value': value,
+                'source': 'NIST',
+                'reliability': self.cfg['reliability'],
+                'conditions': conditions
+            })
 
     def new_compound_request(self, compound):
@@ -15,7 +15,7 @@ class PubChem(Source):
     including sources of the values of properties.
     """
 
-    #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
+    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
     website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
     website_www = 'http://www.ncbi.nlm.nih.gov/*'
     website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
@@ -54,14 +54,16 @@ class PubChem(Source):
         n = re.search(r'cid=(\d+)', response.url)
         if n:
             cid = n.group(1)
-            log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach
+            log.msg('cid: %s' % cid, level=log.DEBUG)  # getting the right id of the compound with which it can reach
             # the seperate html page which contains the properties and their values
 
-            #using this cid to get the right url and scrape it
-            requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data))
+            # using this cid to get the right url and scrape it
+            requests.append(
+                Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
         return requests
 
-    def parse_data(self, response):
+    @staticmethod
+    def parse_data(response):
         """
         Parse data found in 'Chemical and Physical properties' part of a substance page.
         :param response: The response with the page to parse
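Two details of this hunk are worth unpacking. `website_pubchem` is a regular expression, so the request URL is built by chopping off the trailing `.*` and stripping regex escapes; and `parse_data` can become a `@staticmethod` because it never touches `self` — Scrapy still accepts `self.parse_data` as a callback, since that lookup yields the plain function. A small sketch of the URL trick; `data_url` is not shown in this diff, so a placeholder value is assumed:

    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'  # regex, as above
    data_url = 'summary/summary.cgi?cid=%s'                 # placeholder, assumed

    base = website_pubchem[:-2].replace("\\", "")  # drop '.*', unescape
    print(base + data_url % 12345)
    # http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=12345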
@@ -74,8 +76,8 @@ class PubChem(Source):
         props = sel.xpath('//div')
 
         for prop in props:
             prop_name = ''.join(prop.xpath('b/text()').extract())  # name of property that it is parsing
             if prop.xpath('a'):  # parsing for single value in property
                 prop_source = ''.join(prop.xpath('a/@title').extract())
                 prop_value = ''.join(prop.xpath('a/text()').extract())
                 new_prop = Result({
@@ -89,7 +91,7 @@ class PubChem(Source):
                         (new_prop['attribute'], new_prop['value'],
                          new_prop['source']), level=log.DEBUG)
                 requests.append(new_prop)
             elif prop.xpath('ul'):  # parsing for multiple values (list) in property
                 prop_values = prop.xpath('ul//li')
                 for prop_li in prop_values:
                     prop_value = ''.join(prop_li.xpath('a/text()').extract())
@@ -102,8 +104,8 @@ class PubChem(Source):
                         'conditions': ''
                     })
                     log.msg('PubChem prop: |%s| |%s| |%s|' %
                             (new_prop['attribute'], new_prop['value'],
                              new_prop['source']), level=log.DEBUG)
                     requests.append(new_prop)
 
         return requests
@@ -116,7 +118,7 @@ class PubChem(Source):
         case the search request forwarded to the compound page
         """
 
-        #check if pubchem forwarded straight to compound page
+        # check if pubchem forwarded straight to compound page
         m = re.match(self.website_pubchem, response.url)
         if m:
             log.msg('PubChem search forwarded to compound page',
@@ -17,7 +17,7 @@ Options:
 --version                Show version.
 -v                       Verbose logging output. (Multiple occurrences increase logging level)
 --log=<file>             Save log to an file.
--o <file> --output=<file> Output file [default: results.*format*]
+-o <file> --output=<file> Output file [default: <compound>.*format*]
 -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
 --include=<regex>        Include only sources that match these regular expressions split by a comma.
 --exclude=<regex>        Exclude the sources that match these regular expressions split by a comma.
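The new default is a sentinel rather than a real filename: docopt hands it through unchanged, and `Configurator.set_output` (last hunk below) substitutes the compound name. A minimal sketch, assuming a usage pattern along the lines of `fourmi search <compound>` — the usage section itself is not part of this hunk:

    from docopt import docopt

    doc = """Usage: fourmi search <compound> [options]

    Options:
      -o <file> --output=<file>  Output file [default: <compound>.*format*]
    """
    args = docopt(doc, argv=['search', 'methane'])
    print(args['--output'])    # '<compound>.*format*'  (sentinel, untouched)
    print(args['<compound>'])  # 'methane'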
@@ -58,12 +58,12 @@ def search(docopt_arguments, source_loader):
     """
     conf = Configurator()
     conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
-    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
+    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
     setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
                   source_loader, docopt_arguments["--attributes"].split(','))
     if conf.scrapy_settings.getbool("LOG_ENABLED"):
         log.start(conf.scrapy_settings.get("LOG_FILE"),
                   conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
     reactor.run()
 
 
@@ -10,16 +10,16 @@ class TestConfigurator(unittest.TestCase):
         self.conf = Configurator()
 
     def test_set_output(self):
-        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.conf.set_output(filename="test.txt", fileformat="csv", compound="test")
         self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
 
-        self.conf.set_output("results.*format*", "jsonlines")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.conf.set_output("<compound>.*format*", "jsonlines", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")
 
-        self.conf.set_output("results.*format*", "csv")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.conf.set_output("results.*format*", "csv", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
 
     def test_start_log(self):
@@ -3,6 +3,7 @@ import os
 
 from scrapy.utils.project import get_project_settings
 
+
 class Configurator:
     """
     A helper class in the fourmi class. This class is used to process the settings as set
@@ -12,7 +13,7 @@ class Configurator:
     def __init__(self):
         self.scrapy_settings = get_project_settings()
 
-    def set_output(self, filename, fileformat):
+    def set_output(self, filename, fileformat, compound):
         """
         This function manipulates the Scrapy output file settings that normally would be set in the settings file.
         In the Fourmi project these are command line arguments.
@@ -20,12 +21,12 @@ class Configurator:
         :param fileformat: The format in which the output will be.
         """
 
-        if filename != 'results.*format*':
+        if filename != '<compound>.*format*':
             self.scrapy_settings.overrides["FEED_URI"] = filename
         elif fileformat == "jsonlines":
-            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+            self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
         elif fileformat is not None:
-            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+            self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat
 
         if fileformat is not None:
             self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
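Pulled out of the class, the resulting naming logic looks like the sketch below, where `overrides` stands in for `self.scrapy_settings.overrides`; compare the first two checks in the updated test_set_output above:

    def resolve_output(filename, fileformat, compound):
        overrides = {}
        if filename != '<compound>.*format*':   # user passed -o explicitly
            overrides["FEED_URI"] = filename
        elif fileformat == "jsonlines":         # jsonlines still writes .json
            overrides["FEED_URI"] = compound + ".json"
        elif fileformat is not None:
            overrides["FEED_URI"] = compound + "." + fileformat
        if fileformat is not None:
            overrides["FEED_FORMAT"] = fileformat
        return overrides

    print(resolve_output("test.txt", "csv", "test"))
    # {'FEED_URI': 'test.txt', 'FEED_FORMAT': 'csv'}
    print(resolve_output("<compound>.*format*", "jsonlines", "methane"))
    # {'FEED_URI': 'methane.json', 'FEED_FORMAT': 'jsonlines'}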