Merge branch 'feature/compound-name' into develop

Jip J. Dekker 2014-06-20 11:21:43 +02:00
commit 76bf634ad9
6 changed files with 43 additions and 39 deletions

@@ -313,7 +313,8 @@ class NIST(Source):
         :param conditions: optional conditions regarding the value
         :return: A Result item
         """
-        return Result({
+        return Result(
+            {
             'attribute': attribute,
             'value': value,
             'source': 'NIST',
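
For context, Result here is a Scrapy-style item that the source fills once per scraped property. The snippet below is only an illustrative sketch of that pattern; the project's real Result class and its full field list are not shown in this diff, so the declaration is an assumption:

from scrapy.item import Item, Field


class Result(Item):
    # Field names visible in the hunk above; 'conditions' is implied by the
    # docstring. Any further fields of the real class are unknown here.
    attribute = Field()
    value = Field()
    source = Field()
    conditions = Field()


# A dict passed to the constructor fills the item, which is the call shape
# the NIST source uses in both the old and the new formatting.
item = Result({
    'attribute': 'melting point',
    'value': '273.15 K',
    'source': 'NIST',
})
print(item['value'])  # 273.15 K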

@@ -15,7 +15,7 @@ class PubChem(Source):
     including sources of the values of properties.
     """

-    #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
+    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
     website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
     website_www = 'http://www.ncbi.nlm.nih.gov/*'
     website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
@@ -54,14 +54,16 @@ class PubChem(Source):
         n = re.search(r'cid=(\d+)', response.url)
         if n:
             cid = n.group(1)
-            log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach
+            log.msg('cid: %s' % cid, level=log.DEBUG)  # getting the right id of the compound with which it can reach
             # the seperate html page which contains the properties and their values
-            #using this cid to get the right url and scrape it
-            requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data))
+            # using this cid to get the right url and scrape it
+            requests.append(
+                Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
         return requests

-    def parse_data(self, response):
+    @staticmethod
+    def parse_data(response):
         """
         Parse data found in 'Chemical and Physical properties' part of a substance page.
         :param response: The response with the page to parse
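
A detail worth spelling out in the reflowed Request: website_pubchem is a regular expression, not a URL. Slicing off the trailing '.*' with [:-2] and dropping backslash escapes turns the pattern back into a plain base URL that the compound id can be appended to. A standalone sketch of that resolution (the data_url template here is a made-up stand-in; the real one lives elsewhere in the class):

import re

website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'  # regex, as in the class body
data_url = 'summary/summary.cgi?cid=%s'                 # hypothetical template

# Pull the compound id out of the search-result URL, as parse() does.
url = 'http://www.ncbi.nlm.nih.gov/pccompound?cid=2244'
n = re.search(r'cid=(\d+)', url)
if n:
    cid = n.group(1)  # '2244'
    base = website_pubchem[:-2].replace("\\", "")  # strip '.*', drop escapes
    print(base + data_url % cid)
    # http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=2244
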
@@ -116,7 +118,7 @@ class PubChem(Source):
         case the search request forwarded to the compound page
         """

-        #check if pubchem forwarded straight to compound page
+        # check if pubchem forwarded straight to compound page
         m = re.match(self.website_pubchem, response.url)
         if m:
             log.msg('PubChem search forwarded to compound page',
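
The re.match against website_pubchem is what detects the forward: a search that PubChem resolved directly lands on a pubchem.ncbi.nlm.nih.gov URL, while an ordinary result list stays on www.ncbi.nlm.nih.gov. Roughly (example URLs illustrative only):

import re

website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'

print(bool(re.match(website_pubchem, 'http://pubchem.ncbi.nlm.nih.gov/compound/2244')))         # True
print(bool(re.match(website_pubchem, 'http://www.ncbi.nlm.nih.gov/pccompound/?term=aspirin')))  # False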

@@ -17,7 +17,7 @@ Options:
   --version                      Show version.
   -v                             Verbose logging output. (Multiple occurrences increase logging level)
   --log=<file>                   Save log to an file.
-  -o <file> --output=<file>      Output file [default: results.*format*]
+  -o <file> --output=<file>      Output file [default: <compound>.*format*]
   -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: csv]
   --include=<regex>              Include only sources that match these regular expressions split by a comma.
   --exclude=<regex>              Exclude the sources that match these regular expressions split by a comma.
@@ -58,7 +58,7 @@ def search(docopt_arguments, source_loader):
     """
     conf = Configurator()
     conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
-    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
+    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
     setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
                   source_loader, docopt_arguments["--attributes"].split(','))
     if conf.scrapy_settings.getbool("LOG_ENABLED"):
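
The user-visible effect of threading <compound> into set_output: without an explicit -o, the output file is now named after the searched compound rather than the fixed results.* name. Hypothetical invocations (the exact entry point and argument spelling depend on the project's docopt usage string):

python fourmi.py search "methane" --format=json        # writes methane.json
python fourmi.py search "methane" --format=jsonlines   # jsonlines also maps to methane.json
python fourmi.py search "methane" -o custom.csv        # an explicit name still wins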

@@ -10,16 +10,16 @@ class TestConfigurator(unittest.TestCase):
         self.conf = Configurator()

     def test_set_output(self):
-        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.conf.set_output(filename="test.txt", fileformat="csv", compound="test")
         self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")

-        self.conf.set_output("results.*format*", "jsonlines")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.conf.set_output("<compound>.*format*", "jsonlines", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")

-        self.conf.set_output("results.*format*", "csv")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.conf.set_output("results.*format*", "csv", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")

     def test_start_log(self):

@@ -3,6 +3,7 @@ import os

 from scrapy.utils.project import get_project_settings

+
 class Configurator:
     """
     A helper class in the fourmi class. This class is used to process the settings as set
@@ -12,7 +13,7 @@ class Configurator:
     def __init__(self):
         self.scrapy_settings = get_project_settings()

-    def set_output(self, filename, fileformat):
+    def set_output(self, filename, fileformat, compound):
         """
         This function manipulates the Scrapy output file settings that normally would be set in the settings file.
         In the Fourmi project these are command line arguments.
@@ -20,12 +21,12 @@ class Configurator:
         :param fileformat: The format in which the output will be.
         """
-        if filename != 'results.*format*':
+        if filename != '<compound>.*format*':
             self.scrapy_settings.overrides["FEED_URI"] = filename
         elif fileformat == "jsonlines":
-            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+            self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
         elif fileformat is not None:
-            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+            self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat
         if fileformat is not None:
             self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat