From 56ee6b1ad347c475ef24f077377c35577b4fbfc5 Mon Sep 17 00:00:00 2001
From: RTB
Date: Sat, 17 May 2014 14:09:10 +0200
Subject: [PATCH] added ignore list

---
 FourmiCrawler/sources/NIST.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index ddb7a09..17552e5 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -5,11 +5,15 @@ from scrapy.selector import Selector
 from FourmiCrawler.items import Result
 import re
 
+# [TODO]: values can be '128.', perhaps remove the dot in that case?
+
 class NIST(Source):
     website = "http://webbook.nist.gov/*"
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
+    ignore_list = set()
+
     def __init__(self):
         Source.__init__(self)
 
@@ -235,5 +239,8 @@ class NIST(Source):
         return results
 
     def new_compound_request(self, compound):
-        return Request(url=self.website[:-1] + self.search % compound,
-                       callback=self.parse)
+        if compound not in self.ignore_list:
+            # Remember the whole compound name so it is requested only once.
+            self.ignore_list.add(compound)
+            return Request(url=self.website[:-1] + self.search % compound,
+                           callback=self.parse)