
Merge branch 'hotfix/No_TABs' into develop

Jip J. Dekker 2014-04-02 14:22:13 +02:00
commit 3a074467e6
4 changed files with 66 additions and 66 deletions


@@ -13,33 +13,33 @@ from scrapy.utils.project import get_project_settings
import os, inspect, re


def load_parsers(rel_dir="FourmiCrawler/parsers"):
    path = os.path.dirname(os.path.abspath(__file__))
    path += "/" + rel_dir
    parsers = []

    for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
        mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
        classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
        for cls in classes:
            if re.match(path + "/*", inspect.getfile(cls)):
                parsers.append(cls())  # [review] - Would we ever need arguments for the parsers?
    return parsers


def setup_crawler(searchables):
    spider = FourmiSpider(compounds=searchables)
    spider.add_parsers(load_parsers())
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()


def start():
    setup_crawler(["Methane"])
    log.start()
    reactor.run()


start()
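
Note on the entry point above: start() hard-codes a single compound ("Methane"). Since setup_crawler() already accepts a list, crawling several compounds is just a matter of passing a longer list. A minimal sketch, assuming the same module as above (the function name start_for and the example compounds are hypothetical, not part of this commit):

def start_for(compounds):
    # e.g. start_for(["Methane", "Ethanol", "Benzene"])
    setup_crawler(compounds)
    log.start()
    reactor.run()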


@@ -3,19 +3,19 @@ from scrapy import log

class Parser:
    '''
    website should be a regular expression matching the URLs of the requests the parser is able to parse.
    '''
    website = "http://something/*"
    __spider = None

    def parse(self, reponse):
        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
        pass

    def new_compound_request(self, compound):
        # return Request(url=self.website[:-1] + compound, callback=self.parse)
        pass

    def set_spider(self, spider):
        self.__spider = spider
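
The Parser above is only a stub base class: parse() just logs a warning and new_compound_request() is commented out. A concrete parser would override website, parse, and new_compound_request. A minimal sketch of such a subclass (the class name, URL pattern, and the module path of the base class are assumptions for illustration, not part of this commit):

from scrapy.http import Request
from scrapy import log

from FourmiCrawler.parsers.parser import Parser  # assumed location of the base class shown above


class ExampleSourceParser(Parser):
    website = "http://example.org/chemical/*"  # hypothetical data source

    def parse(self, reponse):
        log.msg("Parsing: " + reponse.url, level=log.DEBUG)
        return []  # a real parser would yield scraped Item objects here

    def new_compound_request(self, compound):
        # Strip the trailing "*" from the pattern to build a concrete URL,
        # mirroring the commented-out line in the base class.
        return Request(url=self.website[:-1] + compound, callback=self.parse)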


@@ -19,7 +19,7 @@ class FourmiPipeline(object):
        """
        value = item['attribute'], item['value']
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)  # [todo] - append sources of first item.
        else:
            self.known_values.add(value)
            return item
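
The hunk above only shows the tail of process_item(); the known_values set it relies on is initialised elsewhere in the class. For readability, a minimal sketch of the complete duplicate-filtering pipeline this excerpt implies (the constructor is inferred, not shown in this commit):

from scrapy.exceptions import DropItem


class FourmiPipeline(object):

    def __init__(self):
        self.known_values = set()  # (attribute, value) pairs seen so far

    def process_item(self, item, spider):
        value = item['attribute'], item['value']
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)  # [todo] - append sources of first item.
        else:
            self.known_values.add(value)
            return item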


@@ -4,40 +4,40 @@ import re

class FourmiSpider(Spider):
    name = "FourmiSpider"
    __parsers = []
    synonyms = []

    def __init__(self, compounds=None, *args, **kwargs):
        super(FourmiSpider, self).__init__(*args, **kwargs)
        if isinstance(compounds, list):
            self.synonyms.extend(compounds)
        else:
            self.synonyms.append(compounds)

    def parse(self, reponse):
        for parser in self.__parsers:
            if re.match(parser.website, reponse.url):
                log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG)
                return parser.parse(reponse)
        return None

    def get_synonym_requests(self, compound):
        requests = []
        for parser in self.__parsers:
            requests.append(parser.new_compound_request(compound))
        return requests

    def start_requests(self):
        requests = []
        for synonym in self.synonyms:
            requests.extend(self.get_synonym_requests(synonym))
        return requests

    def add_parsers(self, parsers):
        for parser in parsers:
            self.add_parser(parser)

    def add_parser(self, parser):
        self.__parsers.append(parser)
        parser.set_spider(self)
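
Putting the pieces together: the spider keeps a private list of parsers, start_requests() asks every parser for an initial request per compound synonym, and parse() later routes each downloaded response to the first parser whose website pattern matches the response URL. A minimal wiring sketch, reusing the hypothetical ExampleSourceParser from the earlier example:

spider = FourmiSpider(compounds=["Methane"])
spider.add_parser(ExampleSourceParser())

# One Request per (parser, synonym) pair; Scrapy schedules these first.
initial_requests = spider.start_requests()

# Each response coming back is then dispatched via spider.parse() to the
# parser whose website regular expression matches the response URL.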