
Merge branch 'develop' into feature/Wikipedia

Jip J. Dekker 2014-04-23 22:53:28 +02:00
commit 964e0b8ade
7 changed files with 330 additions and 50 deletions


@@ -0,0 +1,218 @@
from source import Source
from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector
from FourmiCrawler.items import Result
import re


# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
class ChemSpider(Source):
    """ChemSpider scraper for synonyms and properties

    This parser will manage searching for chemicals through the
    ChemSpider API, and parsing the resulting ChemSpider page.
    The token required for the API should be in a configuration file
    somewhere.
    """

    def __init__(self):
        Source.__init__(self)

    website = 'http://www.chemspider.com/*'

    # [TODO] - Save and access token of specific user.
    search = ('Search.asmx/SimpleSearch?query=%s&token='
              '052bfd06-5ce4-43d6-bf12-89eabefd2338')
    structure = 'Chemical-Structure.%s.html'
    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
                    '052bfd06-5ce4-43d6-bf12-89eabefd2338')

    ignore_list = []

    def parse(self, response):
        sel = Selector(response)
        requests = []
        requests_synonyms = self.parse_synonyms(sel)
        requests.extend(requests_synonyms)
        requests_properties = self.parse_properties(sel)
        requests.extend(requests_properties)
        return requests

    @staticmethod
    def parse_properties(sel):
        """Scrape the Experimental Data and Predicted ACD/Labs tabs"""
        properties = []

        # Predicted - ACD/Labs tab
        # [TODO] - test if tab contains data, some chemicals do not have data here
        td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
            'normalize-space(string())')
        prop_names = td_list[::2]
        prop_values = td_list[1::2]
        for (prop_name, prop_value) in zip(prop_names, prop_values):
            # [:-1] is to remove the colon at the end, [TODO] - test for colon
            prop_name = prop_name.extract().encode('utf-8')[:-1]
            prop_value = prop_value.extract().encode('utf-8')
            prop_conditions = ''

            # Match for condition in parentheses
            m = re.match(r'(.*) \((.*)\)', prop_name)
            if m:
                prop_name = m.group(1)
                prop_conditions = m.group(2)

            # Match for condition in value separated by an 'at'
            m = re.match(r'(.*) at (.*)', prop_value)
            if m:
                prop_value = m.group(1)
                prop_conditions = m.group(2)

            new_prop = Result({
                'attribute': prop_name,
                'value': prop_value,
                'source': 'ChemSpider Predicted - ACD/Labs Tab',
                'reliability': 'Unknown',
                'conditions': prop_conditions
            })
            properties.append(new_prop)
            log.msg('CS prop: |%s| |%s| |%s|' %
                    (new_prop['attribute'], new_prop['value'], new_prop['source']),
                    level=log.DEBUG)

        # Experimental Data tab, Physico-chemical properties in particular
        scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                 'Properties"]//li/table/tr/td')
        if not scraped_list:
            return properties
        # Format is: property name followed by a list of values
        property_name = scraped_list.pop(0).xpath(
            'span/text()').extract()[0].rstrip()
        for line in scraped_list:
            if line.xpath('span/text()'):
                property_name = line.xpath('span/text()').extract()[0].rstrip()
            else:
                new_prop = Result({
                    'attribute': property_name[:-1],
                    'value': line.xpath('text()').extract()[0].rstrip(),
                    'source': line.xpath(
                        'strong/text()').extract()[0].rstrip(),
                    'reliability': 'Unknown',
                    'conditions': ''
                })
                properties.append(new_prop)
                log.msg('CS prop: |%s| |%s| |%s|' %
                        (new_prop['attribute'], new_prop['value'],
                         new_prop['source']), level=log.DEBUG)

        return properties

    def parse_synonyms(self, sel):
        """Scrape the list of Names and Identifiers"""
        requests = []
        synonyms = []

        # Exact type for this is unknown, but it is equivalent to Validated by Expert
        for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'):
            name = syn.xpath('span[@class="synonym_cn"]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Experts"
        for syn in sel.xpath('//p[@class="syn"][strong]'):
            name = syn.xpath('strong/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Users"
        for syn in sel.xpath(
                '//p[@class="syn"][span[@class="synonym_confirmed"]]'):
            name = syn.xpath(
                'span[@class="synonym_confirmed"]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'user'))
        # These synonyms are labeled as "Non-validated" and assumed unreliable
        for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'):
            name = syn.xpath('span[@class=""]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'nonvalidated'))

        # [TODO] - confirm if English User-Validated synonyms are OK too
        for syn in synonyms:
            if syn['category'] == 'expert' and syn['language'] == 'English':
                log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG)
                self._spider.get_synonym_requests(syn['name'])

        return requests

    def new_synonym(self, sel, name, category):
        """Scrape a single synonym at a given HTML tag"""
        self.ignore_list.append(name)
        language = sel.xpath('span[@class="synonym_language"]/text()')
        if language:
            # The [1:-1] is to remove the brackets around the language name
            language = language.extract()[0][1:-1]
        else:
            # If no language is given, English is assumed, [TODO] - confirm
            language = 'English'
        log.msg('CS synonym: %s (%s) (%s)' % (name, category, language),
                level=log.DEBUG)

        references = []
        # A synonym can have multiple references, each optionally with a link
        for ref in sel.xpath('span[@class="synonym_ref"]'):
            refname = ref.xpath('normalize-space(string())')
            references.append({
                'name': refname.extract()[0][1:-1],
                'URI': ''
            })
        for ref in sel.xpath('a[@class="synonym_ref"]'):
            references.append({
                'name': ref.xpath('@title').extract()[0],
                'URI': ref.xpath('@href').extract()[0]
            })
        for ref in references:
            log.msg('CS synonym ref: %s %s' % (ref['name'], ref['URI']),
                    level=log.DEBUG)

        synonym = {
            'name': name,
            'category': category,
            'language': language,
            'references': references
        }
        return synonym

    @staticmethod
    def parse_extendedinfo(response):
        """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
        sel = Selector(response)
        properties = []
        names = sel.xpath('*').xpath('name()').extract()
        values = sel.xpath('*').xpath('text()').extract()
        for (name, value) in zip(names, values):
            result = Result({
                'attribute': name,
                'value': value,  # These values have no unit!
                'source': 'ChemSpider ExtendedCompoundInfo',
                'reliability': 'Unknown',
                'conditions': ''
            })
            properties.append(result)
        return properties

    def parse_searchrequest(self, response):
        """Parse the initial response of the ChemSpider Search API"""
        sel = Selector(response)
        log.msg('chemspider parse_searchrequest', level=log.DEBUG)
        sel.register_namespace('cs', 'http://www.chemspider.com/')
        csid = sel.xpath('.//cs:int/text()').extract()[0]
        # [TODO] - handle multiple csids in case of a vague search term
        structure_url = self.website[:-1] + self.structure % csid
        extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
        log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
        return [Request(url=structure_url,
                        callback=self.parse),
                Request(url=extendedinfo_url,
                        callback=self.parse_extendedinfo)]

    def new_compound_request(self, compound):
        if compound in self.ignore_list:  # [TODO] - add regular expression
            return None
        searchurl = self.website[:-1] + self.search % compound
        log.msg('chemspider compound', level=log.DEBUG)
        return Request(url=searchurl, callback=self.parse_searchrequest)
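
A note on how the URL templates in this class compose, as a minimal sketch rather than part of the commit (the token and CSID values below are placeholders):

# Sketch: how ChemSpider's URL templates above compose into request URLs.
website = 'http://www.chemspider.com/*'
search = 'Search.asmx/SimpleSearch?query=%s&token=PLACEHOLDER-TOKEN'
structure = 'Chemical-Structure.%s.html'

# new_compound_request: website[:-1] drops the trailing '*', leaving the
# base URL, and the search template fills in the compound name.
searchurl = website[:-1] + search % 'methane'
# -> http://www.chemspider.com/Search.asmx/SimpleSearch?query=methane&token=PLACEHOLDER-TOKEN

# parse_searchrequest: the csid scraped from the search response fills the
# structure template the same way ('12345' is an arbitrary example value).
structure_url = website[:-1] + structure % '12345'
# -> http://www.chemspider.com/Chemical-Structure.12345.html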


@@ -2,12 +2,12 @@ from scrapy import log
 # from scrapy.http import Request


-class Parser:
-    '''
-    website should be a regular expression for the URLs of requests the parser is able to parse.
-    '''
-    website = "http://something/*"
-    __spider = None
+class Source:
+    website = "http://something/*"  # Regex of URIs the source is able to parse
+    _spider = None

     def __init__(self):
         pass

     def parse(self, reponse):
         log.msg("The parse function of the empty parser was used.", level=log.WARNING)
@@ -18,4 +18,4 @@ class Parser:
         pass

     def set_spider(self, spider):
-        self.__spider = spider
+        self._spider = spider
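
For context, a scraper plugs into the framework by subclassing this base class and overriding its hooks, as the ChemSpider class above does. A minimal sketch; the ExampleSource name and its body are illustrative only, not part of the commit:

from source import Source


class ExampleSource(Source):  # hypothetical source, for illustration
    # Regex of URIs this source claims; the spider dispatches on it.
    website = "http://example.com/*"

    def parse(self, reponse):
        # Return scraped Result items and/or follow-up Requests.
        return []

    def new_compound_request(self, compound):
        # Return a Request that starts a search for the given compound,
        # or None if this source cannot handle it.
        return None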


@@ -15,7 +15,7 @@ class FourmiSpider(Spider):
     def parse(self, reponse):
         for parser in self.__parsers:
             if re.match(parser.website, reponse.url):
-                log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG)
+                log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG)
                 return parser.parse(reponse)
         return None
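
One thing worth noting about this dispatch, sketched below rather than stated in the commit: the website attribute is used as a regular expression, not a glob. re.match anchors at the start of the URL, and the trailing '*' means "zero or more of the preceding '/'", so any URL under the host matches:

import re

pattern = 'http://www.chemspider.com/*'  # regex, not a glob
print(bool(re.match(pattern, 'http://www.chemspider.com/Chemical-Structure.12345.html')))  # True
print(bool(re.match(pattern, 'http://example.com/page')))                                  # False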

LICENSE Normal file

@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2014 Ivo B. Rietveld

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


@@ -5,6 +5,9 @@ Fourmi, a web scraper built to search specific information for a given compound
 Usage:
     fourmi search <compound>
     fourmi [options] search <compound>
+    fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
+    fourmi list
+    fourmi [--include=<sourcename> | --exclude=<sourcename>] list
     fourmi -h | --help
     fourmi --version
@@ -15,40 +18,23 @@ Options:
     --log=<file>                    Save log to a file.
     -o <file> --output=<file>       Output file [default: result.*format*]
     -f <format> --format=<format>   Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
+    --include=<sourcenames>         Include only sources that match these regular expressions, split by a comma.
+    --exclude=<sourcenames>         Exclude sources that match these regular expressions, split by a comma.
 """

-import os
-import inspect
-
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
 from scrapy import log, signals
 from scrapy.utils.project import get_project_settings
 import docopt

-from FourmiCrawler.parsers.parser import Parser
 from FourmiCrawler.spider import FourmiSpider
+from sourceloader import SourceLoader


-def load_parsers(rel_dir="FourmiCrawler/parsers"):
-    path = os.path.dirname(os.path.abspath(__file__))
-    path += "/" + rel_dir
-    parsers = []
-    known_parser = set()
-
-    for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
-        mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
-        classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
-        for cls in classes:
-            if issubclass(cls, Parser) and cls not in known_parser:
-                parsers.append(cls())  # [review] - Would we ever need arguments for the parsers?
-                known_parser.add(cls)
-    return parsers
-
-
-def setup_crawler(searchable, settings):
+def setup_crawler(searchable, settings, source_loader):
     spider = FourmiSpider(compound=searchable)
-    spider.add_parsers(load_parsers())
+    spider.add_parsers(source_loader.sources)
     crawler = Crawler(settings)
     crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     crawler.configure()
@@ -56,39 +42,53 @@ def setup_crawler(searchable, settings):
     crawler.start()


-def scrapy_settings_manipulation(arguments):
+def scrapy_settings_manipulation(docopt_arguments):
     settings = get_project_settings()

-    if arguments["--output"] != 'result.*format*':
-        settings.overrides["FEED_URI"] = arguments["--output"]
-    elif arguments["--format"] == "jsonlines":
+    # [todo] - add at least a warning for files that already exist
+    if docopt_arguments["--output"] != 'result.*format*':
+        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
+    elif docopt_arguments["--format"] == "jsonlines":
         settings.overrides["FEED_URI"] = "results.json"
-    elif arguments["--format"] is not None:
-        settings.overrides["FEED_URI"] = "results." + arguments["--format"]
+    elif docopt_arguments["--format"] is not None:
+        settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]

-    if arguments["--format"] is not None:
-        settings.overrides["FEED_FORMAT"] = arguments["--format"]
+    if docopt_arguments["--format"] is not None:
+        settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]

     return settings


-def start_log(arguments):
-    if arguments["--log"] is not None:
-        if arguments["--verbose"]:
-            log.start(logfile=arguments["--log"], logstdout=False, loglevel=log.DEBUG)
+def start_log(docopt_arguments):
+    if docopt_arguments["--log"] is not None:
+        if docopt_arguments["--verbose"]:
+            log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
         else:
-            log.start(logfile=arguments["--log"], logstdout=True, loglevel=log.WARNING)
+            log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
     else:
-        if arguments["--verbose"]:
+        if docopt_arguments["--verbose"]:
             log.start(logstdout=False, loglevel=log.DEBUG)
         else:
             log.start(logstdout=True, loglevel=log.WARNING)


-if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.1.0')
-    start_log(arguments)
-    settings = scrapy_settings_manipulation(arguments)
-    setup_crawler(arguments["<compound>"], settings)
+def search(docopt_arguments, source_loader):
+    start_log(docopt_arguments)
+    settings = scrapy_settings_manipulation(docopt_arguments)
+    setup_crawler(docopt_arguments["<compound>"], settings, source_loader)
     reactor.run()
+
+
+if __name__ == '__main__':
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.5')
+    loader = SourceLoader()
+
+    if arguments["--include"]:
+        loader.include(arguments["--include"].split(','))
+    elif arguments["--exclude"]:
+        loader.exclude(arguments["--exclude"].split(','))
+
+    if arguments["search"]:
+        search(arguments, loader)
+    elif arguments["list"]:
+        print "-== Available Sources ==-"
+        print str(loader)
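
For reference, a minimal sketch of how the docopt dispatch above behaves, assuming docopt is installed; the shortened usage string and argv values are illustrative, not taken from the commit:

import docopt

usage = """Usage:
    fourmi [--include=<sourcename> | --exclude=<sourcename>] search <compound>
    fourmi list
"""

# docopt turns the usage string into an argument dict: command words like
# 'search'/'list' become booleans, <compound> and option values plain strings.
args = docopt.docopt(usage, argv=['--include=ChemSpider', 'search', 'methane'])
print(args['search'])      # True
print(args['<compound>'])  # methane
print(args['--include'])   # ChemSpider -> feeds loader.include(['ChemSpider'])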

sourceloader.py Normal file

@@ -0,0 +1,41 @@
import inspect
import os
import re

from FourmiCrawler.sources.source import Source


class SourceLoader:
    sources = []

    def __init__(self, rel_dir="FourmiCrawler/sources"):
        path = os.path.dirname(os.path.abspath(__file__))
        path += "/" + rel_dir
        known_parser = set()

        for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
            mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
            classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
            for cls in classes:
                if issubclass(cls, Source) and cls not in known_parser:
                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
                    known_parser.add(cls)

    def include(self, source_names):
        new = set()
        for name in source_names:
            new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
        self.sources = list(new)

    def exclude(self, source_names):
        exclude = []
        for name in source_names:
            exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
        self.sources = [src for src in self.sources if src not in exclude]

    def __str__(self):
        string = ""
        for src in self.sources:
            string += "Source: " + src.__class__.__name__
            string += " - "
            string += "URI: " + src.website + "\n"
        return string
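
The include/exclude filters above compare each given pattern against the source class names with re.match, so a pattern matching the start of a name is enough. A small sketch of the semantics; the class names and pattern are illustrative:

import re

loaded = ['ChemSpider', 'Wikipedia']  # pretend these sources were loaded
patterns = ['Chem']                   # e.g. fourmi --include=Chem list

# include(): keep only sources whose class name matches one of the regexes
kept = [name for name in loaded
        if any(re.match(p, name) for p in patterns)]
print(kept)  # ['ChemSpider'] - re.match anchors at the start of the name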