Archived
1
0

Merge branch 'feature/compound-name' into develop

This commit is contained in:
Jip J. Dekker 2014-06-20 11:21:43 +02:00
commit 76bf634ad9
6 changed files with 43 additions and 39 deletions

View File

@ -313,7 +313,8 @@ class NIST(Source):
:param conditions: optional conditions regarding the value
:return: A Result item
"""
return Result({
return Result(
{
'attribute': attribute,
'value': value,
'source': 'NIST',

View File

@ -15,7 +15,7 @@ class PubChem(Source):
including sources of the values of properties.
"""
#PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
# PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
website_www = 'http://www.ncbi.nlm.nih.gov/*'
website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
@ -54,14 +54,16 @@ class PubChem(Source):
n = re.search(r'cid=(\d+)', response.url)
if n:
cid = n.group(1)
log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach
log.msg('cid: %s' % cid, level=log.DEBUG) # getting the right id of the compound with which it can reach
# the separate html page which contains the properties and their values
#using this cid to get the right url and scrape it
requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data))
# using this cid to get the right url and scrape it
requests.append(
Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
return requests
def parse_data(self, response):
@staticmethod
def parse_data(response):
"""
Parse data found in 'Chemical and Physical properties' part of a substance page.
:param response: The response with the page to parse
@ -116,7 +118,7 @@ class PubChem(Source):
case the search request forwarded to the compound page
"""
#check if pubchem forwarded straight to compound page
# check if pubchem forwarded straight to compound page
m = re.match(self.website_pubchem, response.url)
if m:
log.msg('PubChem search forwarded to compound page',

View File

@ -17,7 +17,7 @@ Options:
--version Show version.
-v Verbose logging output. (Multiple occurrences increase logging level)
--log=<file> Save log to an file.
-o <file> --output=<file> Output file [default: results.*format*]
-o <file> --output=<file> Output file [default: <compound>.*format*]
-f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
--include=<regex> Include only sources that match these regular expressions split by a comma.
--exclude=<regex> Exclude the sources that match these regular expressions split by a comma.
@ -58,7 +58,7 @@ def search(docopt_arguments, source_loader):
"""
conf = Configurator()
conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
source_loader, docopt_arguments["--attributes"].split(','))
if conf.scrapy_settings.getbool("LOG_ENABLED"):

View File

@ -10,16 +10,16 @@ class TestConfigurator(unittest.TestCase):
self.conf = Configurator()
def test_set_output(self):
self.conf.set_output(filename="test.txt", fileformat="csv")
self.conf.set_output(filename="test.txt", fileformat="csv", compound="test")
self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
self.conf.set_output("results.*format*", "jsonlines")
self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
self.conf.set_output("<compound>.*format*", "jsonlines", "test")
self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json")
self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")
self.conf.set_output("results.*format*", "csv")
self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
self.conf.set_output("results.*format*", "csv", "test")
self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv")
self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
def test_start_log(self):

View File

@ -3,6 +3,7 @@ import os
from scrapy.utils.project import get_project_settings
class Configurator:
"""
A helper class in the fourmi class. This class is used to process the settings as set
@ -12,7 +13,7 @@ class Configurator:
def __init__(self):
self.scrapy_settings = get_project_settings()
def set_output(self, filename, fileformat):
def set_output(self, filename, fileformat, compound):
"""
This function manipulates the Scrapy output file settings that normally would be set in the settings file.
In the Fourmi project these are command line arguments.
@ -20,12 +21,12 @@ class Configurator:
:param fileformat: The format in which the output will be.
"""
if filename != 'results.*format*':
if filename != '<compound>.*format*':
self.scrapy_settings.overrides["FEED_URI"] = filename
elif fileformat == "jsonlines":
self.scrapy_settings.overrides["FEED_URI"] = "results.json"
self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
elif fileformat is not None:
self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat
if fileformat is not None:
self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat