diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py index 338f224..e82c8e6 100644 --- a/FourmiCrawler/settings.py +++ b/FourmiCrawler/settings.py @@ -21,7 +21,4 @@ FEED_FORMAT = 'jsonlines' # Crawl responsibly by identifying yourself (and your website) on the # user-agent -# [todo] - Check for repercussions on spoofing the user agent - -# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)' -USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36' +USER_AGENT = 'Fourmi' diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 5920b85..b4bf6f0 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -1,3 +1,5 @@ +import re + from scrapy import log from scrapy.http import Request from scrapy.selector import Selector @@ -5,7 +7,6 @@ from scrapy.selector import Selector from source import Source from FourmiCrawler.items import Result -import re # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception. @@ -18,7 +19,7 @@ class ChemSpider(Source): somewhere. """ - website = 'http://www.chemspider.com/*' + website = 'http://www\\.chemspider\\.com/.*' search = 'Search.asmx/SimpleSearch?query=%s&token=' structure = 'Chemical-Structure.%s.html' @@ -276,8 +277,8 @@ class ChemSpider(Source): log.msg('ChemSpider found multiple substances, taking first ' 'element', level=log.DEBUG) csid = csids[0] - structure_url = self.website[:-1] + self.structure % csid - extendedinfo_url = self.website[:-1] + self.extendedinfo % csid + structure_url = self.website[:-2].replace("\\", "") + self.structure % csid + extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG) return [Request(url=structure_url, callback=self.parse), @@ -292,6 +293,6 @@ class ChemSpider(Source): """ if compound in self.ignore_list or self.cfg['token'] == '': return None - searchurl = self.website[:-1] + self.search % compound + searchurl = self.website[:-2].replace("\\", "") + self.search % compound log.msg('chemspider compound', level=log.DEBUG) return Request(url=searchurl, callback=self.parse_searchrequest) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index c136b80..691b062 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -18,7 +18,7 @@ class NIST(Source): This plugin manages searching for a chemical on the NIST website and parsing the resulting page if the chemical exists on NIST. """ - website = "http://webbook.nist.gov/*" + website = "http://webbook\\.nist\\.gov/.*" search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' @@ -164,7 +164,7 @@ class NIST(Source): extra_data_url = tr.xpath('td[last()][a="Individual data points"]' '/a/@href').extract() if extra_data_url: - request = Request(url=self.website[:-1] + extra_data_url[0], + request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0], callback=self.parse_individual_datapoints) results.append(request) continue @@ -329,5 +329,5 @@ class NIST(Source): """ if compound not in self.ignore_list: self.ignore_list.update(compound) - return Request(url=self.website[:-1] + self.search % compound, + return Request(url=self.website[:-2].replace("\\", "") + self.search % compound, callback=self.parse) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index fc8250b..15fa3f9 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -1,9 +1,11 @@ +import re + from scrapy.http import Request from scrapy import log -from source import Source from scrapy.selector import Selector + +from source import Source from FourmiCrawler.items import Result -import re class PubChem(Source): @@ -14,9 +16,9 @@ class PubChem(Source): """ #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used - website = 'https://*.ncbi.nlm.nih.gov/*' - website_www = 'https://www.ncbi.nlm.nih.gov/*' - website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*' + website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*' + website_www = 'http://www.ncbi.nlm.nih.gov/*' + website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*' search = 'pccompound?term=%s' data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s' @@ -49,14 +51,14 @@ class PubChem(Source): self._spider.get_synonym_requests(synonym) log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG) - n = re.search(r'cid=(\d+)',response.url) + n = re.search(r'cid=(\d+)', response.url) if n: cid = n.group(1) log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach # the seperate html page which contains the properties and their values #using this cid to get the right url and scrape it - requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data)) + requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data)) return requests def parse_data(self, response): @@ -106,6 +108,41 @@ class PubChem(Source): return requests + def parse_searchrequest(self, response): + """ + This function parses the response to the new_compound_request Request + :param response: the Response object to be parsed + :return: A Request for the compound page or what self.parse returns in + case the search request forwarded to the compound page + """ + + #check if pubchem forwarded straight to compound page + m = re.match(self.website_pubchem, response.url) + if m: + log.msg('PubChem search forwarded to compound page', + level=log.DEBUG) + return self.parse(response) + + sel = Selector(response) + + results = sel.xpath('//div[@class="rsltcont"]') + if results: + url = results[0].xpath('div/p/a[1]/@href') + else: + log.msg('PubChem search found nothing or xpath failed', + level=log.DEBUG) + return None + + if url: + url = 'http:' + ''.join(url[0].extract()) + log.msg('PubChem compound page: %s' % url, level=log.DEBUG) + else: + log.msg('PubChem search found results, but no url in first result', + level=log.DEBUG) + return None + + return Request(url=url, callback=self.parse) def new_compound_request(self, compound): - return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) + return Request(url=self.website_www[:-1] + self.search % compound, + callback=self.parse_searchrequest) diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index 401698c..e27bb39 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -15,7 +15,7 @@ class WikipediaParser(Source): It also returns requests with other external sources which contain information on parsed subject. """ - website = "http://en.wikipedia.org/wiki/*" + website = "http://en\\.wikipedia\\.org/wiki/.*" __spider = None searched_compounds = [] @@ -123,7 +123,7 @@ class WikipediaParser(Source): return items def new_compound_request(self, compound): - return Request(url=self.website[:-1] + compound, callback=self.parse) + return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse) @staticmethod def clean_items(items): diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py index 36218b0..a0d3dcd 100644 --- a/FourmiCrawler/sources/source.py +++ b/FourmiCrawler/sources/source.py @@ -3,7 +3,7 @@ from scrapy import log class Source: - website = "http://something/*" # Regex of URI's the source is able to parse + website = "http://something/.*" # Regex of URI's the source is able to parse _spider = None def __init__(self, config=None): @@ -30,7 +30,7 @@ class Source: :param compound: A compound name. :return: A new Scrapy Request """ - # return Request(url=self.website[:-1] + compound, callback=self.parse) + # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse) pass def set_spider(self, spider): diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index ebfd2cf..32181ce 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -34,8 +34,9 @@ class FourmiSpider(Spider): """ for source in self._sources: if re.match(source.website, response.url): - log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG) + log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG) return source.parse(response) + log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO) return None def get_synonym_requests(self, compound, force=False): diff --git a/utils/configurator.py b/utils/configurator.py index b443529..358adc7 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -1,7 +1,7 @@ import ConfigParser +import os from scrapy.utils.project import get_project_settings -import os class Configurator: """ @@ -67,7 +67,7 @@ class Configurator: :return a ConfigParser object of sources.cfg """ current_dir = os.path.dirname(os.path.abspath(__file__)) - config_path = current_dir + '\..\sources.cfg' + config_path = current_dir + '/../sources.cfg' # [TODO]: location of sources.cfg should be softcoded eventually config = ConfigParser.ConfigParser() config.read(config_path)