added comments

2014-04-23 16:17:23 +02:00 · 2014-04-23 16:17:23 +02:00 · 150fc5bea7
commit 150fc5bea7
parent 9cefd336e0
1 changed files with 23 additions and 11 deletions
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@ -8,9 +8,11 @@ import re

 class WikipediaParser(Parser):

-# General notes:
-# Redirects seem to not matter as Wikipedia returns the page the redirect forwards to
-# although this might lead to scraping both the original and the redirect with the same data.
+    """ Wikipedia scraper for chemical properties
+
+    This parser parses Wikipedia infoboxes (also bordered) to obtain properties and their values.
+    It also returns requests for other external sources which contain information on the parsed subject.
+    """

    website = "http://en.wikipedia.org/wiki/*"
    __spider = None
@ -31,10 +33,12 @@ class WikipediaParser(Parser):
            return items

    def parse_infobox(self, sel):
-
+        #scrape the property names and their values from the infobox table(s) of a Wikipedia page.
        items = []

-        tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').xpath('normalize-space(string())')
+        #match both "infobox bordered" and "infobox" tables to cover the chembox and drugbox Wikipedia templates
+        tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').\
+            xpath('normalize-space(string())')
        prop_names = tr_list[::2]
        prop_values = tr_list[1::2]
        for i, prop_name in enumerate(prop_names):
@ -52,10 +56,14 @@ class WikipediaParser(Parser):

        identifiers = self.get_identifiers(sel)

+        #add extra sources to scrape from as requests
        for i, identifier in enumerate(identifiers):
-            if re.match('//en\.wikipedia',identifier):
+            request = None
+            #discard internal wikipedia links
+            if re.match('//en\.wikipedia', identifier):
                log.msg('Found link to wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING)
-            elif re.match('/{2}',identifier):
+            #fix protocol-relative links (starting with '//') by giving them an explicit http scheme
+            elif re.match('/{2}', identifier):
                identifier = re.sub("/{2}", "http://", identifier)
                request = Request(identifier)
            else:
@ -68,18 +76,22 @@ class WikipediaParser(Parser):
    def new_compound_request(self, compound):
        return Request(url=self.website[:-1] + compound, callback=self.parse)

-    def cleanitems(self, items):
+    @staticmethod
+    def cleanitems(items):
+        #clean up property values using regexes, so that each value becomes a number followed by its unit
        for item in items:
            value = item['value']
-            m = re.search('F;\s(\d+[\.,]?\d*)', value)
+            m = re.search('F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)
            if m:
                item['value'] = m.group(1) + " K"
-            m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)
+            m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)  # clean up J/K/mol values
            if m:
                item['value'] = m.group(1) + " J/K/mol"
        return items

-    def get_identifiers(self, sel):
+    @staticmethod
+    def get_identifiers(sel):
+        #find links to other sources (the 'Identifiers' on the page): "external" anchors inside "reflink" spans.
        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
        return links