From 662ee8f49013aa07ded2e3b89216a135a90b4f59 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Wed, 23 Apr 2014 15:49:03 +0200 Subject: [PATCH 1/4] Renamed folder --- FourmiCrawler/{parsers => sources}/ChemSpider.py | 0 FourmiCrawler/{parsers => sources}/__init__.py | 0 FourmiCrawler/{parsers => sources}/parser.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename FourmiCrawler/{parsers => sources}/ChemSpider.py (100%) rename FourmiCrawler/{parsers => sources}/__init__.py (100%) rename FourmiCrawler/{parsers => sources}/parser.py (100%) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py similarity index 100% rename from FourmiCrawler/parsers/ChemSpider.py rename to FourmiCrawler/sources/ChemSpider.py diff --git a/FourmiCrawler/parsers/__init__.py b/FourmiCrawler/sources/__init__.py similarity index 100% rename from FourmiCrawler/parsers/__init__.py rename to FourmiCrawler/sources/__init__.py diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/sources/parser.py similarity index 100% rename from FourmiCrawler/parsers/parser.py rename to FourmiCrawler/sources/parser.py From 1e24453a1152417a2636c042ff9ee61660f31a03 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Wed, 23 Apr 2014 15:51:03 +0200 Subject: [PATCH 2/4] Renamed filename of basic source class --- FourmiCrawler/sources/{parser.py => source.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename FourmiCrawler/sources/{parser.py => source.py} (100%) diff --git a/FourmiCrawler/sources/parser.py b/FourmiCrawler/sources/source.py similarity index 100% rename from FourmiCrawler/sources/parser.py rename to FourmiCrawler/sources/source.py From e18e4b4b26b559e9e605065073d260a87756f408 Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Wed, 23 Apr 2014 15:55:38 +0200 Subject: [PATCH 3/4] Resolved all references to the old folder --- FourmiCrawler/sources/ChemSpider.py | 4 ++-- sourceloader.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 3273107..42abc22 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -1,4 +1,4 @@ -from parser import Parser +from source import Parser from scrapy import log from scrapy.http import Request from scrapy.selector import Selector @@ -18,7 +18,7 @@ class ChemSpider(Parser): """ def __init__(self): - pass + Parser.__init__(self) website = 'http://www.chemspider.com/*' diff --git a/sourceloader.py b/sourceloader.py index 2eff6c1..f380721 100644 --- a/sourceloader.py +++ b/sourceloader.py @@ -1,13 +1,13 @@ import inspect import os import re -from FourmiCrawler.parsers.parser import Parser +from FourmiCrawler.sources.source import Parser class SourceLoader: sources = [] - def __init__(self, rel_dir="FourmiCrawler/parsers"): + def __init__(self, rel_dir="FourmiCrawler/sources"): path = os.path.dirname(os.path.abspath(__file__)) path += "/" + rel_dir known_parser = set() From 90f03734a66235eb0bac9b4a7e9366ca6b2008fc Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Wed, 23 Apr 2014 15:57:10 +0200 Subject: [PATCH 4/4] Refactored class name --- FourmiCrawler/sources/ChemSpider.py | 6 +++--- FourmiCrawler/sources/source.py | 2 +- FourmiCrawler/spider.py | 2 +- sourceloader.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 42abc22..a62f6dd 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -1,4 +1,4 @@ -from source import Parser +from source import Source from scrapy import log from scrapy.http import Request from scrapy.selector import Selector @@ -8,7 +8,7 @@ import re # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception. -class ChemSpider(Parser): +class ChemSpider(Source): """ChemSpider scraper for synonyms and properties This parser will manage searching for chemicals through the @@ -18,7 +18,7 @@ class ChemSpider(Parser): """ def __init__(self): - Parser.__init__(self) + Source.__init__(self) website = 'http://www.chemspider.com/*' diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py index feb4535..3c51724 100644 --- a/FourmiCrawler/sources/source.py +++ b/FourmiCrawler/sources/source.py @@ -2,7 +2,7 @@ from scrapy import log # from scrapy.http import Request -class Parser: +class Source: website = "http://something/*" # Regex of URI's the source is able to parse _spider = None diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 77b2c11..9f92a84 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -15,7 +15,7 @@ class FourmiSpider(Spider): def parse(self, reponse): for parser in self.__parsers: if re.match(parser.website, reponse.url): - log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG) + log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG) return parser.parse(reponse) return None diff --git 
a/sourceloader.py b/sourceloader.py index f380721..9957a70 100644 --- a/sourceloader.py +++ b/sourceloader.py @@ -1,7 +1,7 @@ import inspect import os import re -from FourmiCrawler.sources.source import Parser +from FourmiCrawler.sources.source import Source class SourceLoader: @@ -16,7 +16,7 @@ class SourceLoader: mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py]) classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] for cls in classes: - if issubclass(cls, Parser) and cls not in known_parser: + if issubclass(cls, Source) and cls not in known_parser: self.sources.append(cls()) # [review] - Would we ever need arguments for the parsers? known_parser.add(cls)