Merge branch 'develop' of github.com:Recondor/Fourmi into develop
commit 0e7e4cbe61
@@ -47,7 +47,6 @@ class ChemSpider(Source):
         properties = []

         # Predicted - ACD/Labs tab
-        # [TODO] - test if tab contains data, some chemicals do not have data here
         td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
             'normalize-space(string())')
         prop_names = td_list[::2]
@@ -58,6 +57,12 @@ class ChemSpider(Source):
             prop_value = prop_value.extract().encode('utf-8')
             prop_conditions = ''

+            # Test for properties without values, with one hardcoded exception
+            if (not re.match(r'^\d', prop_value) or
+                    (prop_name == 'Polarizability' and
+                     prop_value == '10-24cm3')):
+                continue
+
             # Match for condition in parentheses
             m = re.match(r'(.*) \((.*)\)', prop_name)
             if m:
@@ -192,6 +197,7 @@ class ChemSpider(Source):
                 'reliability': 'Unknown',
                 'conditions': ''
             })
-            properties.append(result)
+            if result['value']:
+                properties.append(result)
         return properties

@@ -200,8 +206,14 @@ class ChemSpider(Source):
         sel = Selector(response)
         log.msg('chemspider parse_searchrequest', level=log.DEBUG)
         sel.register_namespace('cs', 'http://www.chemspider.com/')
-        csid = sel.xpath('.//cs:int/text()').extract()[0]
-        # [TODO] - handle multiple csids in case of vague search term
+        csids = sel.xpath('.//cs:int/text()').extract()
+        if len(csids) == 0:
+            log.msg('ChemSpider found nothing', level=log.ERROR)
+            return
+        elif len(csids) > 1:
+            log.msg('ChemSpider found multiple substances, taking first '
+                    'element', level=log.DEBUG)
+        csid = csids[0]
         structure_url = self.website[:-1] + self.structure % csid
         extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
         log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
@@ -36,8 +36,8 @@ class WikipediaParser(Source):
         """ scrape data from infobox on wikipedia. """
         items = []

-        #be sure to get both chembox (wikipedia template) and drugbox (wikipedia template) to scrape
-        tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').\
+        #be sure to get chembox (wikipedia template)
+        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
             xpath('normalize-space(string())')
         prop_names = tr_list[::2]
         prop_values = tr_list[1::2]
@@ -46,11 +46,31 @@ class WikipediaParser(Source):
                 'attribute': prop_name.extract().encode('utf-8'),
                 'value': prop_values[i].extract().encode('utf-8'),
                 'source': "Wikipedia",
-                'reliability': "",
+                'reliability': "Unknown",
                 'conditions': ""
             })
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+
+        #scrape the drugbox (wikipedia template)
+        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
+        log.msg('dit: %s' % tr_list2, level=log.DEBUG)
+        for tablerow in tr_list2:
+            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
+            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
+                    'normalize-space(string())'):
+                item = Result({
+                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    'source': "Wikipedia",
+                    'reliability': "Unknown",
+                    'conditions': ""
+                })
+                items.append(item)
+                log.msg(
+                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
+                    level=log.DEBUG)
+
         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
         item_list = self.clean_items(items)

81 README.md Normal file
@@ -0,0 +1,81 @@
+# Fourmi
+
+Fourmi is a web scraper for chemical substances. The program is designed to be
+used as a search engine to search multiple chemical databases for a specific
+substance. The program will produce all available attributes of the substance
+and conditions associated with the attributes. Fourmi also attempts to estimate
+the reliability of each data point to assist the user in deciding which data
+should be used.
+
+The Fourmi project is an open source project licensed under the MIT license.
+Feel free to contribute!
+
+Fourmi is based on the [Scrapy framework](http://scrapy.org/), an open source
+web scraping framework for Python. Most of the functionality of this project
+can be traced to this framework. Should the documentation for this application
+fall short, we suggest you take a close look at the
+[Scrapy architecture](http://doc.scrapy.org/en/latest/topics/architecture.html)
+and the [Scrapy documentation](http://doc.scrapy.org/en/latest/index.html).
+
+### Installing
+
+If you're installing Fourmi, please take a look at our [installation guide](...)
+on our wiki. When you've installed the application, make sure to check our
+[usage guide](...).
+
+### Using the Source
+
+To use the Fourmi source code, multiple dependencies are required. Take a look
+at the [wiki page](...) on using the application source code for a step-by-step
+installation guide.
+
+When developing for the Fourmi project, keep in mind that code readability is a
+must. To maintain readability, code should conform to the
+[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
+code. More information about the different structures and principles of the
+Fourmi application can be found on our [wiki](...).
+
+### To Do
+
+The Fourmi project has the following goals for the near future:
+
+__Main goals:__
+
+- Improve our documentation and guides. (Assignee: Dekker)
+- Build a graphical user interface (GUI) as an alternative to the command line
+  interface (CLI). (Assignee: Harmen)
+- Compile the source into a Windows executable. (Assignee: Bas)
+- Create a configuration file to hold logins and API keys.
+- Determine the reliability of our data points.
+- Create a module to gather data from NIST. (Assignee: Rob)
+- Create a module to gather data from PubChem. (Assignee: Nout)
+
+__Side goals:__
+
+- Clean and unify data.
+- Extensive reliability analysis using statistical tests.
+- Test data with Descartes 1.
+
+### Project Origin
+
+The Fourmi project was started in February of 2014 as part of a software
+engineering course at the Radboud University for students studying Computer
+Science, Information Science or Artificial Intelligence. Students participate
+in a real software development project as part of the
+[Giphouse](http://www.giphouse.nl/).
+
+This particular project was started on behalf of Ivo B. Rietveld. As a chemist
+he was in need of an application to automatically search information on chemical
+substances and create a phase diagram. The so-called "Descartes" project was
+split into two teams, each creating a different application that has part of the
+functionality. We are team Descartes 2, and as we were responsible for creating
+a web crawler, we've named our application Fourmi (English: ant).
+
+The following people were part of the original team:
+
+- [Jip J. Dekker](http://jip.dekker.li)
+- Rob ten Berge
+- Harmen Prins
+- Bas van Berkel
+- Nout van Deijck
+- Michail Kuznetcov
16 README.rst
@@ -1,16 +0,0 @@
-We are the team Descartes 2.
-----------------------------
-
-Our team members are:
-
-+ Rob ten Berge
-
-+ Bas van Berkel
-
-+ Nout van Deijck
-
-+ Jip J. Dekker
-
-+ Michail Kuznetcov
-
-+ Harmen Prins
@@ -80,7 +80,7 @@ def search(docopt_arguments, source_loader):


 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.6')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0')
     loader = SourceLoader()

     if arguments["--include"]: