151 lines
5.3 KiB
Python
151 lines
5.3 KiB
Python
from source import Source
|
|
from scrapy import log
|
|
from scrapy.http import Request
|
|
from scrapy.selector import Selector
|
|
from FourmiCrawler.items import Result
|
|
import re
|
|
|
|
class NIST(Source):
    """Source that scrapes property tables from the NIST Chemistry WebBook.

    ``parse`` walks every ``<table class="data">`` on a compound page,
    decides which layout the table uses and hands it to the matching
    ``parse_*_data`` helper. Every helper returns a list of ``Result`` items.
    """

    # The trailing '*' is cut off with [:-1] before the search path is
    # appended (see new_compound_request).
    website = "http://webbook.nist.gov/*"

    # Search path relative to the website root; %s receives the compound.
    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

    def __init__(self):
        Source.__init__(self)

    def parse(self, response):
        """Extract all supported data tables from a compound page.

        :param response: the response of a request built by
            ``new_compound_request``
        :return: a list with the ``Result`` items found on the page
        """
        sel = Selector(response)

        requests = []

        # The symbol table is a legend of alternating (symbol, name) cells;
        # the aggregate data table refers to properties by symbol only.
        symbol_table = {}
        tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
        for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
            symbol = ''.join(symbol_td.xpath('node()').extract())
            names = name_td.xpath('text()').extract()
            if not names:
                # An empty name cell gives nothing to map the symbol to.
                log.msg('NIST symbol: |%s| has no name, skipped' % symbol,
                        level=log.WARNING)
                continue
            name = names[0]
            symbol_table[symbol] = name
            log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                    level=log.DEBUG)

        for table in sel.xpath('//table[@class="data"]'):
            summaries = table.xpath('@summary').extract()
            if not summaries:
                # Both the dispatch below and the parsers rely on the summary
                # attribute; without it the table cannot be classified, and
                # indexing the empty list would abort the whole page.
                log.msg('NIST table: no summary attribute, skipped',
                        level=log.WARNING)
                continue
            summary = summaries[0]

            # The order of the checks matters: the specific layouts have to
            # be recognised before the generic column-count fallbacks.
            if summary == 'One dimensional data':
                log.msg('NIST table: Aggregrate data', level=log.DEBUG)
                requests.extend(
                    self.parse_aggregate_data(table, symbol_table))
            elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                # A boolean XPath expression is extracted as '1' or '0'.
                log.msg('NIST table; Enthalpy/entropy of phase transition',
                        level=log.DEBUG)
                requests.extend(
                    self.parse_transition_data(table, symbol_table))
            elif table.xpath('tr[1]/td'):
                # Recognised, but no parser exists for this layout: only log.
                log.msg('NIST table: Horizontal table', level=log.DEBUG)
            elif summary == 'Antoine Equation Parameters':
                log.msg('NIST table: Antoine Equation Parameters',
                        level=log.DEBUG)
                requests.extend(
                    self.parse_antoine_data(table))
            elif len(table.xpath('tr[1]/th')) == 5:
                log.msg('NIST table: generic 5 columns', level=log.DEBUG)
                # Symbol (unit) Temperature (K) Method Reference Comment
                requests.extend(
                    self.parse_generic_data(table))
            elif len(table.xpath('tr[1]/th')) == 4:
                log.msg('NIST table: generic 4 columns', level=log.DEBUG)
                # Symbol (unit) Temperature (K) Reference Comment
                requests.extend(
                    self.parse_generic_data(table))
            else:
                # Assume unsupported
                log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
        return requests

    @staticmethod
    def _extract_unit(table):
        """Return the unit written in parentheses in the first header cell.

        :param table: selector of a data table
        :return: the text between the outermost parentheses of the first
            ``<th>`` of the first row, or ``'!'`` when there are none
        """
        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
        m = re.search(r'\((.*)\)', tr_unit)
        if m:
            return m.group(1)
        return '!'

    @staticmethod
    def _cell_texts(tr, required):
        """Return the text nodes of the ``<td>`` cells of one table row.

        :param tr: selector of a table row
        :param required: the minimum number of values the caller will index
        :return: the list of text nodes, or ``None`` (after logging a
            warning) when the row holds fewer than ``required`` values
        """
        tds = tr.xpath('td/text()').extract()
        if len(tds) < required:
            log.msg('NIST: skipping row with %d of %d expected values'
                    % (len(tds), required), level=log.WARNING)
            return None
        return tds

    @staticmethod
    def parse_aggregate_data(table, symbol_table):
        """Parse the 'One dimensional data' table.

        Each row is read as (symbol, value, unit, ...).

        :param table: selector of the data table
        :param symbol_table: dict mapping a symbol to its property name
        :return: a list of ``Result`` items, one per usable row
        """
        results = []
        for tr in table.xpath('tr[td]'):
            data = [''.join(td.xpath('node()').extract())
                    for td in tr.xpath('td')]
            if len(data) < 3:
                log.msg('NIST: skipping short aggregate row |%s|' % (data,),
                        level=log.WARNING)
                continue
            result = Result({
                # Fall back to the raw symbol when the legend lacks it.
                'attribute': symbol_table.get(data[0], data[0]),
                'value': data[1] + ' ' + data[2],
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': ''
            })
            log.msg('NIST: |%s|' % (data,), level=log.DEBUG)
            results.append(result)
        return results

    @staticmethod
    def parse_transition_data(table, symbol_table):
        """Parse an enthalpy/entropy of phase transition table.

        Each row is read as (value, temperature, initial phase, final phase).

        :param table: selector of the data table
        :param symbol_table: unused; kept so the signature stays compatible
        :return: a list of ``Result`` items, one per usable row
        """
        results = []

        name = table.xpath('@summary').extract()[0]
        unit = NIST._extract_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = NIST._cell_texts(tr, 4)
            if tds is None:
                continue
            result = Result({
                'attribute': name,
                'value': tds[0] + ' ' + unit,
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
            })
            results.append(result)

        return results

    @staticmethod
    def parse_generic_data(table):
        """Parse a generic table whose rows start with (value, temperature).

        :param table: selector of the data table
        :return: a list of ``Result`` items, one per usable row
        """
        results = []

        name = table.xpath('@summary').extract()[0]
        unit = NIST._extract_unit(table)

        for tr in table.xpath('tr[td]'):
            tds = NIST._cell_texts(tr, 2)
            if tds is None:
                continue
            result = Result({
                'attribute': name,
                'value': tds[0] + ' ' + unit,
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': '%s K' % tds[1]
            })
            results.append(result)
        return results

    @staticmethod
    def parse_antoine_data(table):
        """Parse the 'Antoine Equation Parameters' table.

        Each row is read as (temperature, A, B, C).

        :param table: selector of the data table
        :return: a list of ``Result`` items, one per usable row
        """
        results = []

        name = table.xpath('@summary').extract()[0]

        for tr in table.xpath('tr[td]'):
            tds = NIST._cell_texts(tr, 4)
            if tds is None:
                continue
            result = Result({
                'attribute': name,
                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
                'source': 'NIST',
                'reliability': 'Unknown',
                'conditions': '%s K' % tds[0]
            })
            results.append(result)

        return results

    def new_compound_request(self, compound):
        """Build the search request for a compound.

        :param compound: value substituted into the ``Name`` query parameter
        :return: a ``Request`` whose callback is ``parse``
        """
        # website ends in '*', which is dropped before the path is appended.
        return Request(url=self.website[:-1] + self.search % compound,
                       callback=self.parse)
|