From bfa78f4697bb4b49f8855d0ccefef2121abe64f2 Mon Sep 17 00:00:00 2001 From: Bas Vb Date: Tue, 10 Jun 2014 22:30:59 +0200 Subject: [PATCH] Clean up documentation in Wikipedia parsers --- FourmiCrawler/sources/WikipediaParser.py | 40 ++++++++++++++++++------ 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index 8722cef..344f836 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -1,11 +1,9 @@ -import re - from scrapy.http import Request from scrapy import log -from scrapy.selector import Selector - from source import Source +from scrapy.selector import Selector from FourmiCrawler.items import Result +import re class WikipediaParser(Source): @@ -26,7 +24,11 @@ class WikipediaParser(Source): self.cfg = config def parse(self, response): - """ Distributes the above described behaviour """ + """ + Distributes the above described behaviour + :param response: The incoming search request + :return: Returns the found properties if response is unique or returns none if it's already known + """ log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG) sel = Selector(response) compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0] # makes sure to use main page @@ -38,7 +40,14 @@ class WikipediaParser(Source): return items def parse_infobox(self, sel): - """ scrape data from infobox on wikipedia. """ + """ + Scrape data from infobox on wikipedia. + + Data from two types of infoboxes: class="infobox bordered" and class="infobox" is scraped and + :param sel: The selector with the html-information of the page to parse + :return: item_list: Returns a list of properties with their values, source, etc.. + """ + items = [] # be sure to get chembox (wikipedia template) @@ -54,7 +63,7 @@ class WikipediaParser(Source): items.append(item) log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) - #scrape the drugbox (wikipedia template) + #scrape the drugbox (wikipedia template) tr_list2 = sel.xpath('.//table[@class="infobox"]//tr') log.msg('dit: %s' % tr_list2, level=log.DEBUG) for tablerow in tr_list2: @@ -97,7 +106,15 @@ class WikipediaParser(Source): @staticmethod def clean_items(items): - """ clean up properties using regex, makes it possible to split the values from the units """ + + """ + Clean up properties using regex, makes it possible to split the values from the units + + Almost not in use, only cleans J/K/mol values and boiling/melting points. + + :param items: List of properties with their values, source, etc.. + :return: items: List of now cleaned up items + """ for item in items: value = item['value'] m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F) @@ -110,7 +127,12 @@ class WikipediaParser(Source): @staticmethod def get_identifiers(sel): - """ find external links, named 'Identifiers' to different sources. """ + """ + Find external links, named 'Identifiers' to different sources. + + :param sel: The selector with the html-information of the page to parse + :return: links: New links which can be used to expand the crawlers search + """ links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a' '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract() return links