From f6981057df603edb819712c5e799d824dd8a8c71 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 2 Apr 2014 14:20:05 +0200
Subject: [PATCH] Changed everything to spaces

---
 Fourmi.py                       | 42 +++++++++++-----------
 FourmiCrawler/parsers/parser.py | 26 +++++++-------
 FourmiCrawler/pipelines.py      |  2 +-
 FourmiCrawler/spider.py         | 62 ++++++++++++++++-----------
 4 files changed, 66 insertions(+), 66 deletions(-)

diff --git a/Fourmi.py b/Fourmi.py
index 1f076ea..a0f54a4 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -13,33 +13,33 @@ from scrapy.utils.project import get_project_settings
 import os, inspect, re
 
 def load_parsers(rel_dir="FourmiCrawler/parsers"):
-	path = os.path.dirname(os.path.abspath(__file__))
-	path += "/" + rel_dir
-	parsers = []
+    path = os.path.dirname(os.path.abspath(__file__))
+    path += "/" + rel_dir
+    parsers = []
 
-	for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
-		mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
-		classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
-		for cls in classes:
-			if re.match(path + "/*", inspect.getfile(cls)):
-				parsers.append(cls()) # [review] - Would we ever need arguments for the parsers?
-	return parsers
+    for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
+        mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
+        classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
+        for cls in classes:
+            if re.match(path + "/*", inspect.getfile(cls)):
+                parsers.append(cls()) # [review] - Would we ever need arguments for the parsers?
+    return parsers
 
 
 def setup_crawler(searchables):
-	spider = FourmiSpider(compounds=searchables)
-	spider.add_parsers(load_parsers())
-	settings = get_project_settings()
-	crawler = Crawler(settings)
-	crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
-	crawler.configure()
-	crawler.crawl(spider)
-	crawler.start()
+    spider = FourmiSpider(compounds=searchables)
+    spider.add_parsers(load_parsers())
+    settings = get_project_settings()
+    crawler = Crawler(settings)
+    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
+    crawler.configure()
+    crawler.crawl(spider)
+    crawler.start()
 
 
 def start():
-	setup_crawler(["Methane"])
-	log.start()
-	reactor.run()
+    setup_crawler(["Methane"])
+    log.start()
+    reactor.run()
 
 start()
diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py
index 8499fea..cac5019 100644
--- a/FourmiCrawler/parsers/parser.py
+++ b/FourmiCrawler/parsers/parser.py
@@ -3,19 +3,19 @@ from scrapy import log
 
 
 class Parser:
-	'''
-	website should be an regular expression of the urls of request the parser is able to parse.
-	'''
-	website = "http://something/*"
-	__spider = None
+    '''
+    website should be an regular expression of the urls of request the parser is able to parse.
+    '''
+    website = "http://something/*"
+    __spider = None
 
-	def parse(self, reponse):
-		log.msg("The parse function of the empty parser was used.", level=log.WARNING)
-		pass
+    def parse(self, reponse):
+        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
+        pass
 
-	def new_compound_request(self, compound):
-		# return Request(url=self.website[:-1] + compound, callback=self.parse)
-		pass
+    def new_compound_request(self, compound):
+        # return Request(url=self.website[:-1] + compound, callback=self.parse)
+        pass
 
-	def set_spider(self, spider):
-		self.__spider = spider
+    def set_spider(self, spider):
+        self.__spider = spider
diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py
index 3194d7e..5f2b68f 100644
--- a/FourmiCrawler/pipelines.py
+++ b/FourmiCrawler/pipelines.py
@@ -19,7 +19,7 @@ class FourmiPipeline(object):
         """
         value = item['attribute'], item['value']
         if value in self.known_values:
-            raise DropItem("Duplicate item found: %s" % item)
+            raise DropItem("Duplicate item found: %s" % item) # #[todo] append sources of first item.
         else:
             self.known_values.add(value)
         return item
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 4d6b897..42f1e15 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -4,40 +4,40 @@ import re
 
 
 class FourmiSpider(Spider):
-	name = "FourmiSpider"
-	__parsers = []
-	synonyms = []
+    name = "FourmiSpider"
+    __parsers = []
+    synonyms = []
 
-	def __init__(self, compounds=None, *args, **kwargs):
-		super(FourmiSpider, self).__init__(*args, **kwargs)
-		if isinstance(compounds, list):
-			self.synonyms.extend(compounds)
-		else:
-			self.synonyms.append(compounds)
+    def __init__(self, compounds=None, *args, **kwargs):
+        super(FourmiSpider, self).__init__(*args, **kwargs)
+        if isinstance(compounds, list):
+            self.synonyms.extend(compounds)
+        else:
+            self.synonyms.append(compounds)
 
-	def parse(self, reponse):
-		for parser in self.__parsers:
-			if re.match(parser.website, reponse.url):
-				log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG)
-				return parser.parse(reponse)
-		return None
+    def parse(self, reponse):
+        for parser in self.__parsers:
+            if re.match(parser.website, reponse.url):
+                log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG)
+                return parser.parse(reponse)
+        return None
 
-	def get_synonym_requests(self, compound):
-		requests = []
-		for parser in self.__parsers:
-			requests.append(parser.new_compound_request(compound))
-		return requests
+    def get_synonym_requests(self, compound):
+        requests = []
+        for parser in self.__parsers:
+            requests.append(parser.new_compound_request(compound))
+        return requests
 
-	def start_requests(self):
-		requests = []
-		for synonym in self.synonyms:
-			requests.extend(self.get_synonym_requests(synonym))
-		return requests
+    def start_requests(self):
+        requests = []
+        for synonym in self.synonyms:
+            requests.extend(self.get_synonym_requests(synonym))
+        return requests
 
-	def add_parsers(self, parsers):
-		for parser in parsers:
-			self.add_parser(parser)
+    def add_parsers(self, parsers):
+        for parser in parsers:
+            self.add_parser(parser)
 
-	def add_parser(self, parser):
-		self.__parsers.append(parser)
-		parser.set_spider(self)
\ No newline at end of file
+    def add_parser(self, parser):
+        self.__parsers.append(parser)
+        parser.set_spider(self)
\ No newline at end of file