Archived
1
0
This repository has been archived on 2025-03-03. You can view files and clone it, but cannot push or open issues or pull requests.

153 lines
5.4 KiB
Python

from source import Source
from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector
from FourmiCrawler.items import Result
import re
class NIST(Source):
website = "http://webbook.nist.gov/*"
search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
def __init__(self):
Source.__init__(self)
def parse(self, response):
sel = Selector(response)
requests = []
symbol_table = {}
tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
symbol = ''.join(symbol_td.xpath('node()').extract())
name = name_td.xpath('text()').extract()[0]
symbol_table[symbol] = name
log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
level=log.DEBUG)
for tables in sel.xpath('//table[@class="data"]'):
if tables.xpath('@summary').extract()[0] == 'One dimensional data':
log.msg('NIST table: Aggregrate data', level=log.DEBUG)
requests.extend(
self.parse_aggregate_data(tables, symbol_table))
elif tables.xpath('tr/th="Initial Phase"').extract()[0] == '1':
log.msg('NIST table; Enthalpy/entropy of phase transition',
level=log.DEBUG)
requests.extend(
self.parse_transition_data(tables, symbol_table))
elif tables.xpath('tr[1]/td'):
log.msg('NIST table: Horizontal table', level=log.DEBUG)
elif (tables.xpath('@summary').extract()[0] ==
'Antoine Equation Parameters'):
log.msg('NIST table: Antoine Equation Parameters',
level=log.DEBUG)
requests.extend(
self.parse_antoine_data(tables))
elif len(tables.xpath('tr[1]/th')) == 5:
log.msg('NIST table: generic 5 columns', level=log.DEBUG)
# Symbol (unit) Temperature (K) Method Reference Comment
requests.extend(
self.parse_generic_data(tables))
elif len(tables.xpath('tr[1]/th')) == 4:
log.msg('NIST table: generic 4 columns', level=log.DEBUG)
# Symbol (unit) Temperature (K) Reference Comment
requests.extend(
self.parse_generic_data(tables))
else:
log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
continue #Assume unsupported
return requests
@staticmethod
def parse_aggregate_data(table, symbol_table):
results = []
for tr in table.xpath('tr[td]'):
data = []
for td in tr.xpath('td'):
data.append(''.join(td.xpath('node()').extract()))
result = Result({
'attribute': symbol_table[data[0]],
'value': data[1] + ' ' + data[2],
'source': 'NIST',
'reliability': 'Unknown',
'conditions': ''
})
log.msg('NIST: |%s|' % data, level=log.DEBUG)
results.append(result)
return results
@staticmethod
def parse_transition_data(table, symbol_table):
results = []
name = table.xpath('@summary').extract()[0]
tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
m = re.search(r'\((.*)\)', tr_unit)
unit = '!'
if m:
unit = m.group(1)
for tr in table.xpath('tr[td]'):
tds = tr.xpath('td/text()').extract()
result = Result({
'attribute': name,
'value': tds[0] + ' ' + unit,
'source': 'NIST',
'reliability': 'Unknown',
'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
})
log.msg('NIST: |%s|' % result, level=log.DEBUG)
results.append(result)
return results
@staticmethod
def parse_generic_data(table):
results = []
name = table.xpath('@summary').extract()[0]
tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
m = re.search(r'\((.*)\)', tr_unit)
unit = '!'
if m:
unit = m.group(1)
for tr in table.xpath('tr[td]'):
tds = tr.xpath('td/text()').extract()
result = Result({
'attribute': name,
'value': tds[0] + ' ' + unit,
'source': 'NIST',
'reliability': 'Unknown',
'conditions': '%s K' % tds[1]
})
log.msg('NIST: |%s|' % result, level=log.DEBUG)
results.append(result)
return results
@staticmethod
def parse_antoine_data(table):
results = []
name = table.xpath('@summary').extract()[0]
for tr in table.xpath('tr[td]'):
tds = tr.xpath('td/text()').extract()
result = Result({
'attribute': name,
'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
'source': 'NIST',
'reliability': 'Unknown',
'conditions': '%s K' % tds[0]
})
results.append(result)
return results
def new_compound_request(self, compound):
return Request(url=self.website[:-1] + self.search % compound,
callback=self.parse)