From 0cc1b233533a4667175c0f1b2eab9aefc9cca4f2 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 30 Mar 2014 23:37:42 +0200 Subject: [PATCH 1/8] Added the functionality to add parsers and automatically use them. --- Fourmi.py | 26 ++++++++++++++------------ FourmiCrawler/parsers/parser.py | 11 +++++++---- FourmiCrawler/spider.py | 19 ++++++++++++------- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/Fourmi.py b/Fourmi.py index 3d54c71..094a5d7 100755 --- a/Fourmi.py +++ b/Fourmi.py @@ -9,23 +9,25 @@ from scrapy.crawler import Crawler from scrapy import log, signals from FourmiCrawler.spider import FourmiSpider from scrapy.utils.project import get_project_settings +from FourmiCrawler.parsers.parser import Parser def setup_crawler(searchable): - # [TODO] - Initiate all parsers for the different websites and get - # allowed URLs. - spider = FourmiSpider(compound=searchable) - settings = get_project_settings() - crawler = Crawler(settings) - crawler.signals.connect(reactor.stop, signal=signals.spider_closed) - crawler.configure() - crawler.crawl(spider) - crawler.start() + # [TODO] - Initiate all parsers for the different websites and get allowed URLs. + spider = FourmiSpider(compound=searchable) + spider.add_parser(Parser()) + settings = get_project_settings() + crawler = Crawler(settings) + crawler.signals.connect(reactor.stop, signal=signals.spider_closed) + crawler.configure() + crawler.crawl(spider) + crawler.start() def start(): - setup_crawler("Methane") - log.start() - reactor.run() + setup_crawler("Methane") + log.start() + reactor.run() + start() diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py index 3362d59..7097ee3 100644 --- a/FourmiCrawler/parsers/parser.py +++ b/FourmiCrawler/parsers/parser.py @@ -2,8 +2,11 @@ from scrapy import log class Parser: - website = "http://localhost/*" + ''' + website should be an regular expression of websites you want to parse. 
+ ''' + website = "http://localhost/*" - def parse(self, reponse): - log.msg("The parse function of the empty parser was used.", level=log.Warning) + def parse(self, reponse): + log.msg("The parse function of the empty parser was used.", level=log.WARNING) + pass \ No newline at end of file diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index a08d997..40d6dfc 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -1,19 +1,24 @@ from scrapy.spider import Spider +from scrapy import log +import re class FourmiSpider(Spider): name = "FourmiSpider" + start_urls = ["http://localhost/"] + parsers = [] def __init__(self, compound=None, *args, **kwargs): super(FourmiSpider, self).__init__(*args, **kwargs) self.synonyms = [compound] - -def parse(self, reponse): - # [TODO] - This function should delegate it's functionality to other - # parsers. - pass + def parse(self, reponse): + for parser in self.parsers: + if re.match(parser.website, reponse.url): + log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG) + return parser.parse(reponse) + return none -def add_parser(self, parser): - self.parsers.add(parser) + def add_parser(self, parser): + self.parsers.append(parser) From 4d9e5307bf0c00f1db07511affd1a7c389efe812 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Mon, 31 Mar 2014 00:48:45 +0200 Subject: [PATCH 2/8] Written a loader for all parsers in the parser directory. 
--- Fourmi.py | 16 +++++++++++++--- FourmiCrawler/spider.py | 4 +++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/Fourmi.py b/Fourmi.py index 094a5d7..c411b4a 100755 --- a/Fourmi.py +++ b/Fourmi.py @@ -9,13 +9,23 @@ from scrapy.crawler import Crawler from scrapy import log, signals from FourmiCrawler.spider import FourmiSpider from scrapy.utils.project import get_project_settings -from FourmiCrawler.parsers.parser import Parser +import os, inspect +def load_parsers(rel_dir="FourmiCrawler/parsers"): + path = os.path.dirname(os.path.abspath(__file__)) + path += "/" + rel_dir + parsers = [] + + for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: + mod = __import__('.'.join(["FourmiCrawler.parsers", py]), fromlist=[py]) # [todo] - This module name should be derived from the rel_dir variable + classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] + for cls in classes: + parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? + return parsers def setup_crawler(searchable): - # [TODO] - Initiate all parsers for the different websites and get allowed URLs. 
spider = FourmiSpider(compound=searchable) - spider.add_parser(Parser()) + spider.add_parsers(load_parsers()) settings = get_project_settings() crawler = Crawler(settings) crawler.signals.connect(reactor.stop, signal=signals.spider_closed) diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 40d6dfc..d2711c4 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -5,7 +5,6 @@ import re class FourmiSpider(Spider): name = "FourmiSpider" - start_urls = ["http://localhost/"] parsers = [] def __init__(self, compound=None, *args, **kwargs): @@ -22,3 +21,6 @@ class FourmiSpider(Spider): def add_parser(self, parser): self.parsers.append(parser) + + def add_parsers(self, parsers): + self.parsers.extend(parsers) From e39ed3b68139fa38449cc8948938b59b0d0ff9f1 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 1 Apr 2014 20:56:32 +0200 Subject: [PATCH 3/8] Added a way for parsers to access the spider. --- FourmiCrawler/parsers/parser.py | 12 ++++++++++-- FourmiCrawler/spider.py | 13 +++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py index 7097ee3..78d9dc1 100644 --- a/FourmiCrawler/parsers/parser.py +++ b/FourmiCrawler/parsers/parser.py @@ -5,8 +5,16 @@ class Parser: ''' website should be an regular expression of websites you want to parse. 
''' - website = "http://localhost/*" + website = "http://something/*" + __spider = None def parse(self, reponse): log.msg("The parse function of the empty parser was used.", level=log.WARNING) - pass \ No newline at end of file + pass + + def generate_search_url(self, compound): + # return website[:-1] + compound + pass + + def set_spider(self, spider): + self.__spider = spider diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index d2711c4..9b356f8 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -16,11 +16,12 @@ class FourmiSpider(Spider): if re.match(parser.website, reponse.url): log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG) return parser.parse(reponse) - return none - - - def add_parser(self, parser): - self.parsers.append(parser) + return None def add_parsers(self, parsers): - self.parsers.extend(parsers) + for parser in parsers: + self.add_parser(parser) + + def add_parser(self, parser): + self.parsers.add(parser) + parser.set_spider(self) \ No newline at end of file From f93dc2d1602b00c735236a02d7a7611be57657be Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 1 Apr 2014 21:07:36 +0200 Subject: [PATCH 4/8] Added an structure to get requests for all websites for a new synonym --- FourmiCrawler/parsers/parser.py | 7 ++++--- FourmiCrawler/spider.py | 11 +++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py index 78d9dc1..68f73cf 100644 --- a/FourmiCrawler/parsers/parser.py +++ b/FourmiCrawler/parsers/parser.py @@ -1,9 +1,10 @@ from scrapy import log +from scrapy.http import Request class Parser: ''' - website should be an regular expression of websites you want to parse. + website should be an regular expression of the urls of request the parser is able to parse. 
''' website = "http://something/*" __spider = None @@ -12,8 +13,8 @@ class Parser: log.msg("The parse function of the empty parser was used.", level=log.WARNING) pass - def generate_search_url(self, compound): - # return website[:-1] + compound + def new_compound_request(self, compound): + # return Request(url=self.website[:-1] + compound, callable=self.parse) pass def set_spider(self, spider): diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 9b356f8..edd74a9 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -5,7 +5,7 @@ import re class FourmiSpider(Spider): name = "FourmiSpider" - parsers = [] + __parsers = [] def __init__(self, compound=None, *args, **kwargs): super(FourmiSpider, self).__init__(*args, **kwargs) @@ -18,10 +18,17 @@ class FourmiSpider(Spider): return parser.parse(reponse) return None + def get_synonym_requests(self, compound): + requests = [] + for parser in self.parsers: + requests.append(parser.new_compound_request(compound)) + return requests + + def add_parsers(self, parsers): for parser in parsers: self.add_parser(parser) def add_parser(self, parser): - self.parsers.add(parser) + self.__parsers.add(parser) parser.set_spider(self) \ No newline at end of file From 683f8c09d44888eb165af1e9a738067a8ff621ea Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Tue, 1 Apr 2014 21:12:54 +0200 Subject: [PATCH 5/8] Quick fix, python errors --- Fourmi.py | 2 +- FourmiCrawler/parsers/parser.py | 2 +- FourmiCrawler/spider.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Fourmi.py b/Fourmi.py index c411b4a..a71400c 100755 --- a/Fourmi.py +++ b/Fourmi.py @@ -18,7 +18,7 @@ def load_parsers(rel_dir="FourmiCrawler/parsers"): for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: mod = __import__('.'.join(["FourmiCrawler.parsers", py]), fromlist=[py]) # [todo] - This module name should be derived from the rel_dir variable - classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] + classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] # [fix] - This also finds classes that are imported. for cls in classes: parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? return parsers diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py index 68f73cf..a3710c5 100644 --- a/FourmiCrawler/parsers/parser.py +++ b/FourmiCrawler/parsers/parser.py @@ -1,5 +1,5 @@ from scrapy import log -from scrapy.http import Request +# from scrapy.http import Request class Parser: diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index edd74a9..3fc5ce0 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -30,5 +30,5 @@ class FourmiSpider(Spider): self.add_parser(parser) def add_parser(self, parser): - self.__parsers.add(parser) + self.__parsers.append(parser) parser.set_spider(self) \ No newline at end of file From 0bf2d102c6ca7c6db2fb035a3774fe032155fa8e Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 1 Apr 2014 21:21:30 +0200 Subject: [PATCH 6/8] Fixed parser importation, so it doesn't import imported classes. 
--- Fourmi.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Fourmi.py b/Fourmi.py index a71400c..2bed5cc 100755 --- a/Fourmi.py +++ b/Fourmi.py @@ -9,7 +9,7 @@ from scrapy.crawler import Crawler from scrapy import log, signals from FourmiCrawler.spider import FourmiSpider from scrapy.utils.project import get_project_settings -import os, inspect +import os, inspect, re def load_parsers(rel_dir="FourmiCrawler/parsers"): path = os.path.dirname(os.path.abspath(__file__)) @@ -18,9 +18,10 @@ def load_parsers(rel_dir="FourmiCrawler/parsers"): for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: mod = __import__('.'.join(["FourmiCrawler.parsers", py]), fromlist=[py]) # [todo] - This module name should be derived from the rel_dir variable - classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] # [fix] - This also finds classes that are imported. + classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] for cls in classes: - parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? + if re.match(path + "/*", inspect.getfile(cls)): + parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? return parsers def setup_crawler(searchable): From cd421cc2fbf02e702b1a7fcf3db03c94cac77d30 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 1 Apr 2014 21:24:04 +0200 Subject: [PATCH 7/8] Replaced literal for testing with a variable fix. 
--- Fourmi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Fourmi.py b/Fourmi.py index 2bed5cc..015ae13 100755 --- a/Fourmi.py +++ b/Fourmi.py @@ -17,7 +17,7 @@ def load_parsers(rel_dir="FourmiCrawler/parsers"): parsers = [] for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: - mod = __import__('.'.join(["FourmiCrawler.parsers", py]), fromlist=[py]) # [todo] - This module name should be derived from the rel_dir variable + mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py]) classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] for cls in classes: if re.match(path + "/*", inspect.getfile(cls)): From 7bc160f67623dc382003680221fbd74d256441aa Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 1 Apr 2014 21:38:11 +0200 Subject: [PATCH 8/8] The spider is now able to start using the synonym generator --- Fourmi.py | 6 +++--- FourmiCrawler/parsers/parser.py | 2 +- FourmiCrawler/spider.py | 17 +++++++++++++---- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/Fourmi.py b/Fourmi.py index 015ae13..1a3e11b 100755 --- a/Fourmi.py +++ b/Fourmi.py @@ -24,8 +24,8 @@ def load_parsers(rel_dir="FourmiCrawler/parsers"): parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? 
return parsers -def setup_crawler(searchable): - spider = FourmiSpider(compound=searchable) +def setup_crawler(searchables): + spider = FourmiSpider(compounds=searchables) spider.add_parsers(load_parsers()) settings = get_project_settings() crawler = Crawler(settings) @@ -36,7 +36,7 @@ def setup_crawler(searchable): def start(): - setup_crawler("Methane") + setup_crawler(["Methane"]) log.start() reactor.run() diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py index a3710c5..8499fea 100644 --- a/FourmiCrawler/parsers/parser.py +++ b/FourmiCrawler/parsers/parser.py @@ -14,7 +14,7 @@ class Parser: pass def new_compound_request(self, compound): - # return Request(url=self.website[:-1] + compound, callable=self.parse) + # return Request(url=self.website[:-1] + compound, callback=self.parse) pass def set_spider(self, spider): diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 3fc5ce0..4d6b897 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -6,13 +6,17 @@ import re class FourmiSpider(Spider): name = "FourmiSpider" __parsers = [] + synonyms = [] - def __init__(self, compound=None, *args, **kwargs): + def __init__(self, compounds=None, *args, **kwargs): super(FourmiSpider, self).__init__(*args, **kwargs) - self.synonyms = [compound] + if isinstance(compounds, list): + self.synonyms.extend(compounds) + else: + self.synonyms.append(compounds) def parse(self, reponse): - for parser in self.parsers: + for parser in self.__parsers: if re.match(parser.website, reponse.url): log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG) return parser.parse(reponse) @@ -20,10 +24,15 @@ class FourmiSpider(Spider): def get_synonym_requests(self, compound): requests = [] - for parser in self.parsers: + for parser in self.__parsers: requests.append(parser.new_compound_request(compound)) return requests + def start_requests(self): + requests = [] + for synonym in self.synonyms: + 
requests.extend(self.get_synonym_requests(synonym)) + return requests def add_parsers(self, parsers): for parser in parsers: