Merge branch 'feature/compound-name' into develop

2014-06-20 11:21:43 +02:00 · 2014-06-20 11:21:43 +02:00 · 76bf634ad9
commit 76bf634ad9
parent 335f558aca 22ca4afa33
6 changed files with 43 additions and 39 deletions
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@ -89,7 +89,7 @@ class ChemSpider(Source):

            # Test for properties without values, with one hardcoded exception
            if (not re.match(r'^\d', prop_value) or
-                (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
+                    (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
                continue

            m = re.match(r'(.*) \((.*)\)', prop_name)
@ -122,12 +122,12 @@ class ChemSpider(Source):
        properties = []

        scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
-                         'Properties"]//li/table/tr/td')
+                                 'Properties"]//li/table/tr/td')
        if not scraped_list:
            return properties
        # Format is: property name followed by a list of values
        property_name = scraped_list.pop(0).xpath(
-        'span/text()').extract()[0].rstrip()
+            'span/text()').extract()[0].rstrip()
        for line in scraped_list:
            if line.xpath('span/text()'):
                property_name = line.xpath('span/text()').extract()[0].rstrip()
@ -251,12 +251,12 @@ class ChemSpider(Source):
        :return: A Result item
        """
        return Result({
-                'attribute': attribute,
-                'value': value,
-                'source': source,
-                'reliability': self.cfg['reliability'],
-                'conditions': conditions
-            })
+            'attribute': attribute,
+            'value': value,
+            'source': source,
+            'reliability': self.cfg['reliability'],
+            'conditions': conditions
+        })

    def parse_searchrequest(self, response):
        """
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@ -313,12 +313,13 @@ class NIST(Source):
        :param conditions: optional conditions regarding the value
        :return: A Result item
        """
-        return Result({
-            'attribute': attribute,
-            'value': value,
-            'source': 'NIST',
-            'reliability': self.cfg['reliability'],
-            'conditions': conditions
+        return Result(
+            {
+                'attribute': attribute,
+                'value': value,
+                'source': 'NIST',
+                'reliability': self.cfg['reliability'],
+                'conditions': conditions
            })

    def new_compound_request(self, compound):
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@ -15,7 +15,7 @@ class PubChem(Source):
        including sources of the values of properties.
    """

-    #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
+    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
    website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
    website_www = 'http://www.ncbi.nlm.nih.gov/*'
    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
@ -54,14 +54,16 @@ class PubChem(Source):
        n = re.search(r'cid=(\d+)', response.url)
        if n:
            cid = n.group(1)
-        log.msg('cid: %s' % cid, level=log.DEBUG)   #getting the right id of the compound with which it can reach
-                                                # the seperate html page which contains the properties and their values
+        log.msg('cid: %s' % cid, level=log.DEBUG)  # getting the right id of the compound with which it can reach
+        # the separate html page which contains the properties and their values

-        #using this cid to get the right url and scrape it
-        requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data))
+        # using this cid to get the right url and scrape it
+        requests.append(
+            Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
        return requests

-    def parse_data(self, response):
+    @staticmethod
+    def parse_data(response):
        """
        Parse data found in 'Chemical and Physical properties' part of a substance page.
        :param response: The response with the page to parse
@ -74,8 +76,8 @@ class PubChem(Source):
        props = sel.xpath('//div')

        for prop in props:
-            prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing
-            if prop.xpath('a'):     # parsing for single value in property
+            prop_name = ''.join(prop.xpath('b/text()').extract())  # name of property that it is parsing
+            if prop.xpath('a'):  # parsing for single value in property
                prop_source = ''.join(prop.xpath('a/@title').extract())
                prop_value = ''.join(prop.xpath('a/text()').extract())
                new_prop = Result({
@ -89,7 +91,7 @@ class PubChem(Source):
                        (new_prop['attribute'], new_prop['value'],
                         new_prop['source']), level=log.DEBUG)
                requests.append(new_prop)
-            elif prop.xpath('ul'):    # parsing for multiple values (list) in property
+            elif prop.xpath('ul'):  # parsing for multiple values (list) in property
                prop_values = prop.xpath('ul//li')
                for prop_li in prop_values:
                    prop_value = ''.join(prop_li.xpath('a/text()').extract())
@ -102,8 +104,8 @@ class PubChem(Source):
                        'conditions': ''
                    })
                    log.msg('PubChem prop: |%s| |%s| |%s|' %
-                        (new_prop['attribute'], new_prop['value'],
-                         new_prop['source']), level=log.DEBUG)
+                            (new_prop['attribute'], new_prop['value'],
+                             new_prop['source']), level=log.DEBUG)
                    requests.append(new_prop)

        return requests
@ -116,7 +118,7 @@ class PubChem(Source):
                 case the search request forwarded to the compound page
        """

-        #check if pubchem forwarded straight to compound page
+        # check if pubchem forwarded straight to compound page
        m = re.match(self.website_pubchem, response.url)
        if m:
            log.msg('PubChem search forwarded to compound page',
--- a/fourmi.py
+++ b/fourmi.py
@ -17,7 +17,7 @@ Options:
    --version                       Show version.
    -v                              Verbose logging output. (Multiple occurrences increase logging level)
    --log=<file>                    Save log to an file.
-    -o <file> --output=<file>       Output file [default: results.*format*]
+    -o <file> --output=<file>       Output file [default: <compound>.*format*]
    -f <format> --format=<format>   Output formats (supported: csv, json, jsonlines, xml) [default: csv]
    --include=<regex>               Include only sources that match these regular expressions split by a comma.
    --exclude=<regex>               Exclude the sources that match these regular expressions split by a comma.
@ -58,12 +58,12 @@ def search(docopt_arguments, source_loader):
    """
    conf = Configurator()
    conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
-    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
+    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
                  source_loader, docopt_arguments["--attributes"].split(','))
    if conf.scrapy_settings.getbool("LOG_ENABLED"):
        log.start(conf.scrapy_settings.get("LOG_FILE"),
-              conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
+                  conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
    reactor.run()


--- a/tests/test_configurator.py
+++ b/tests/test_configurator.py
@ -10,16 +10,16 @@ class TestConfigurator(unittest.TestCase):
        self.conf = Configurator()

    def test_set_output(self):
-        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.conf.set_output(filename="test.txt", fileformat="csv", compound="test")
        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")

-        self.conf.set_output("results.*format*", "jsonlines")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.conf.set_output("<compound>.*format*", "jsonlines", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json")
        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")

-        self.conf.set_output("results.*format*", "csv")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.conf.set_output("results.*format*", "csv", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv")
        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")

    def test_start_log(self):
--- a/utils/configurator.py
+++ b/utils/configurator.py
@ -3,6 +3,7 @@ import os

 from scrapy.utils.project import get_project_settings

+
 class Configurator:
    """
    A helper class in the fourmi class. This class is used to process the settings as set
@ -12,7 +13,7 @@ class Configurator:
    def __init__(self):
        self.scrapy_settings = get_project_settings()

-    def set_output(self, filename, fileformat):
+    def set_output(self, filename, fileformat, compound):
        """
        This function manipulates the Scrapy output file settings that normally would be set in the settings file.
        In the Fourmi project these are command line arguments.
@ -20,12 +21,12 @@ class Configurator:
        :param fileformat: The format in which the output will be.
        """

-        if filename != 'results.*format*':
+        if filename != '<compound>.*format*':
            self.scrapy_settings.overrides["FEED_URI"] = filename
        elif fileformat == "jsonlines":
-            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+            self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
        elif fileformat is not None:
-            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+            self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat

        if fileformat is not None:
            self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat