Clean up documentation in Wikipedia parsers
This commit is contained in:
parent
cb53c3941d
commit
bfa78f4697
@ -1,11 +1,9 @@
|
|||||||
import re
|
|
||||||
|
|
||||||
from scrapy.http import Request
|
from scrapy.http import Request
|
||||||
from scrapy import log
|
from scrapy import log
|
||||||
from scrapy.selector import Selector
|
|
||||||
|
|
||||||
from source import Source
|
from source import Source
|
||||||
|
from scrapy.selector import Selector
|
||||||
from FourmiCrawler.items import Result
|
from FourmiCrawler.items import Result
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
class WikipediaParser(Source):
|
class WikipediaParser(Source):
|
||||||
@ -26,7 +24,11 @@ class WikipediaParser(Source):
|
|||||||
self.cfg = config
|
self.cfg = config
|
||||||
|
|
||||||
def parse(self, response):
|
def parse(self, response):
|
||||||
""" Distributes the above described behaviour """
|
"""
|
||||||
|
Distributes the above described behaviour
|
||||||
|
:param response: The incoming search request
|
||||||
|
:return: Returns the found properties if the response is unique, or None if it is already known
|
||||||
|
"""
|
||||||
log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
|
log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
|
||||||
sel = Selector(response)
|
sel = Selector(response)
|
||||||
compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0] # makes sure to use main page
|
compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0] # makes sure to use main page
|
||||||
@ -38,7 +40,14 @@ class WikipediaParser(Source):
|
|||||||
return items
|
return items
|
||||||
|
|
||||||
def parse_infobox(self, sel):
|
def parse_infobox(self, sel):
|
||||||
""" scrape data from infobox on wikipedia. """
|
"""
|
||||||
|
Scrape data from infobox on wikipedia.
|
||||||
|
|
||||||
|
Data from two types of infoboxes, class="infobox bordered" and class="infobox", is scraped.
|
||||||
|
:param sel: The selector with the html-information of the page to parse
|
||||||
|
:return: item_list: Returns a list of properties with their values, source, etc.
|
||||||
|
"""
|
||||||
|
|
||||||
items = []
|
items = []
|
||||||
|
|
||||||
# be sure to get chembox (wikipedia template)
|
# be sure to get chembox (wikipedia template)
|
||||||
@ -54,7 +63,7 @@ class WikipediaParser(Source):
|
|||||||
items.append(item)
|
items.append(item)
|
||||||
log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
|
log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
|
||||||
|
|
||||||
#scrape the drugbox (wikipedia template)
|
#scrape the drugbox (wikipedia template)
|
||||||
tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
|
tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
|
||||||
log.msg('dit: %s' % tr_list2, level=log.DEBUG)
|
log.msg('dit: %s' % tr_list2, level=log.DEBUG)
|
||||||
for tablerow in tr_list2:
|
for tablerow in tr_list2:
|
||||||
@ -97,7 +106,15 @@ class WikipediaParser(Source):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def clean_items(items):
|
def clean_items(items):
|
||||||
""" clean up properties using regex, makes it possible to split the values from the units """
|
|
||||||
|
"""
|
||||||
|
Clean up properties using regex, makes it possible to split the values from the units
|
||||||
|
|
||||||
|
Hardly in use at the moment; it only cleans J/K/mol values and boiling/melting points.
|
||||||
|
|
||||||
|
:param items: List of properties with their values, source, etc.
|
||||||
|
:return: items: List of now cleaned up items
|
||||||
|
"""
|
||||||
for item in items:
|
for item in items:
|
||||||
value = item['value']
|
value = item['value']
|
||||||
m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F)
|
m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F)
|
||||||
@ -110,7 +127,12 @@ class WikipediaParser(Source):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_identifiers(sel):
|
def get_identifiers(sel):
|
||||||
""" find external links, named 'Identifiers' to different sources. """
|
"""
|
||||||
|
Find external links, named 'Identifiers' to different sources.
|
||||||
|
|
||||||
|
:param sel: The selector with the html-information of the page to parse
|
||||||
|
:return: links: New links which can be used to expand the crawlers search
|
||||||
|
"""
|
||||||
links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
|
links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
|
||||||
'[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
|
'[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
|
||||||
return links
|
return links
|
||||||
|
Reference in New Issue
Block a user