diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index 42abc22..a62f6dd 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -1,4 +1,4 @@
-from source import Parser
+from source import Source
 from scrapy import log
 from scrapy.http import Request
 from scrapy.selector import Selector
@@ -8,7 +8,7 @@
 import re
 
 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
-class ChemSpider(Parser):
+class ChemSpider(Source):
     """ChemSpider scraper for synonyms and properties
 
     This parser will manage searching for chemicals through the
@@ -18,7 +18,7 @@ class ChemSpider(Parser):
     """
 
     def __init__(self):
-        Parser.__init__(self)
+        Source.__init__(self)
 
     website = 'http://www.chemspider.com/*'
 
diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py
index feb4535..3c51724 100644
--- a/FourmiCrawler/sources/source.py
+++ b/FourmiCrawler/sources/source.py
@@ -2,7 +2,7 @@
 from scrapy import log
 # from scrapy.http import Request
 
-class Parser:
+class Source:
     website = "http://something/*"  # Regex of URI's the source is able to parse
     _spider = None
 
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 77b2c11..9f92a84 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -15,7 +15,7 @@ class FourmiSpider(Spider):
     def parse(self, reponse):
         for parser in self.__parsers:
             if re.match(parser.website, reponse.url):
-                log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG)
+                log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG)
                 return parser.parse(reponse)
         return None
 
diff --git a/sourceloader.py b/sourceloader.py
index f380721..9957a70 100644
--- a/sourceloader.py
+++ b/sourceloader.py
@@ -1,7 +1,7 @@
 import inspect
 import os
 import re
-from FourmiCrawler.sources.source import Parser
+from FourmiCrawler.sources.source import Source
 
 
 class SourceLoader:
@@ -16,7 +16,7 @@ class SourceLoader:
             mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
             classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
             for cls in classes:
-                if issubclass(cls, Parser) and cls not in known_parser:
+                if issubclass(cls, Source) and cls not in known_parser:
                     self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
                     known_parser.add(cls)
 
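
After this rename, a scraper plugs into the crawler by subclassing Source instead of Parser. A minimal sketch of such a subclass is shown below; the ExampleSource name, its website pattern, and the parse body are illustrative only and are not part of this change:

    from source import Source
    from scrapy import log


    class ExampleSource(Source):
        # Regex of URIs this source handles; FourmiSpider.parse() matches
        # each request URL against it before delegating to this source.
        website = 'http://example.com/*'

        def __init__(self):
            Source.__init__(self)

        def parse(self, response):
            # A real source would extract synonyms and properties from the
            # response here and return the resulting items/requests.
            log.msg("Parsing: " + response.url, level=log.DEBUG)
            return None

SourceLoader picks such a class up automatically: it instantiates every class found under FourmiCrawler/sources that is a subclass of Source.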