diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py index b671197..3bcf786 100644 --- a/FourmiCrawler/parsers/WikipediaParser.py +++ b/FourmiCrawler/parsers/WikipediaParser.py @@ -8,9 +8,11 @@ import re class WikipediaParser(Parser): -# General notes: -# Redirects seem to not matter as Wikipedia returns the page the redirect forwards to -# although this might lead to scraping both the original and the redirect with the same data. + """ Wikipedia scraper for chemical properties + + This parser parses Wikipedia infoboxes (also bordered) to obtain properties and their values. + It also returns requests with other external sources which contain information on parsed subject. + """ website = "http://en.wikipedia.org/wiki/*" __spider = None @@ -31,10 +33,12 @@ class WikipediaParser(Parser): return items def parse_infobox(self, sel): - + #scrape data from infobox on wikipedia. items = [] - tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').xpath('normalize-space(string())') + #be sure to get both chembox (wikipedia template) and drugbox (wikipedia template) to scrape + tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').\ + xpath('normalize-space(string())') prop_names = tr_list[::2] prop_values = tr_list[1::2] for i, prop_name in enumerate(prop_names): @@ -52,10 +56,14 @@ class WikipediaParser(Parser): identifiers = self.get_identifiers(sel) + #add extra sources to scrape from as requests for i, identifier in enumerate(identifiers): - if re.match('//en\.wikipedia',identifier): + request = None + #discard internal wikipedia links + if re.match('//en\.wikipedia', identifier): log.msg('Found link to wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING) - elif re.match('/{2}',identifier): + #fix links starting with '//www.' + elif re.match('/{2}', identifier): identifier = re.sub("/{2}", "http://", identifier) request = Request(identifier) else: @@ -68,18 +76,22 @@ class WikipediaParser(Parser): def new_compound_request(self, compound): return Request(url=self.website[:-1] + compound, callback=self.parse) - def cleanitems(self, items): + @staticmethod + def cleanitems(items): + #clean up properties using regex, makes it possible to split the values from the units for item in items: value = item['value'] - m = re.search('F;\s(\d+[\.,]?\d*)', value) + m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F) if m: item['value'] = m.group(1) + " K" - m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value) + m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value) # clean up J/K/mol values if m: item['value'] = m.group(1) + " J/K/mol" return items - def get_identifiers(self, sel): + @staticmethod + def get_identifiers(sel): + #find external links, named 'Identifiers' to different sources. links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a' '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract() return links \ No newline at end of file