Fourmi/FourmiCrawler/spider.py

# Note: scrapy.spider and scrapy.log are the import paths used by the older
# Scrapy versions this project targeted; newer Scrapy releases moved Spider to
# scrapy.spiders and replaced scrapy.log with the standard logging module.
from scrapy.spider import Spider
from scrapy import log
import re

class FourmiSpider(Spider):
    """A generic spider that delegates the actual scraping to pluggable
    source parsers registered via add_parser/add_parsers."""
    name = "FourmiSpider"

    # Note: these are class-level lists, shared by all FourmiSpider instances.
    __parsers = []
    synonyms = []

    def __init__(self, compounds=None, *args, **kwargs):
        """Accept either a single compound or a list of compound synonyms."""
        super(FourmiSpider, self).__init__(*args, **kwargs)
        if isinstance(compounds, list):
            self.synonyms.extend(compounds)
        elif compounds is not None:
            # Guard against the default None so it is not added as a synonym.
            self.synonyms.append(compounds)

    def parse(self, response):
        """Dispatch a response to the first parser whose website pattern
        matches the response URL."""
        for parser in self.__parsers:
            if re.match(parser.website, response.url):
                log.msg("Url: " + response.url + " -> Parser: " + parser.website,
                        level=log.DEBUG)
                return parser.parse(response)
        return None

    def get_synonym_requests(self, compound):
        """Collect the initial request each registered parser wants to make
        for the given compound."""
        requests = []
        for parser in self.__parsers:
            parser_requests = parser.new_compound_request(compound)
            if parser_requests is not None:
                requests.append(parser_requests)
        return requests

    def start_requests(self):
        """Generate the start requests for all known synonyms."""
        requests = []
        for synonym in self.synonyms:
            requests.extend(self.get_synonym_requests(synonym))
        return requests

    def add_parsers(self, parsers):
        """Register a collection of parsers on this spider."""
        for parser in parsers:
            self.add_parser(parser)

    def add_parser(self, parser):
        """Register a single parser and give it a reference back to the spider."""
        self.__parsers.append(parser)
        parser.set_spider(self)
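
For context, the parser objects this spider expects only need a `website` attribute (a regex the spider matches against response URLs) and three methods: `parse`, `new_compound_request`, and `set_spider`. Below is a minimal sketch of such a parser and how it might be wired up; the `ExampleSource` class, its URL, and its internals are hypothetical and not part of this repository.

from scrapy.http import Request


class ExampleSource(object):
    # Hypothetical source parser: any object with this interface can be
    # registered on FourmiSpider via add_parser/add_parsers.
    website = "http://example\\.com/.*"  # regex matched against response URLs

    def __init__(self):
        self._spider = None

    def set_spider(self, spider):
        # Called by FourmiSpider.add_parser so the parser can reach the spider.
        self._spider = spider

    def new_compound_request(self, compound):
        # Return the initial Request for this compound, or None to skip it.
        return Request(url="http://example.com/search?q=" + compound,
                       callback=self.parse)

    def parse(self, response):
        # Extract items and/or follow-up requests from a matching response.
        return []


# Possible wiring (hypothetical usage):
# spider = FourmiSpider(compounds=["methane", "CH4"])
# spider.add_parser(ExampleSource())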