From b0146cdce8a8f90919c020f178fbb821b1ab61bc Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Tue, 22 Apr 2014 09:46:19 +0200
Subject: [PATCH] Added regular expressions to clean up temperature data

---
 FourmiCrawler/parsers/WikipediaParser.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index 38d42f8..07263a5 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -47,8 +47,13 @@ class WikipediaParser(Parser):
     def cleanitems(self, items):
         for item in items:
             value=item['value']
-            if re.match('3(...)', value):
-                print value
+            if re.search('F;\s(\d+[\.,]?\d*)', value):
+                #print re.search('F;\s(\d+[\.,]?\d*)', value).group(1)
+                item['value']=re.search('F;\s(\d+[\.,]?\d*)', value).group(1) + " K"
+            if re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value):
+                print item['value']
+                item['value']=re.search('(\d+[\.,]?\d*)\sJ\sK.+mol', value).group(1) + " J/K/mol"
+                print item['value']
         return items
 
     def getboilingpoint(self, sel):