From f728dff6b09614f98b51b756c3bbd4b7f3cda12f Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 14 May 2014 12:01:05 +0200 Subject: [PATCH 01/13] Developing PubChem parser, first draft, not tested nor finished completely --- FourmiCrawler/sources/PubChem.py | 84 ++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 FourmiCrawler/sources/PubChem.py diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py new file mode 100644 index 0000000..00b2cd7 --- /dev/null +++ b/FourmiCrawler/sources/PubChem.py @@ -0,0 +1,84 @@ +from scrapy.http import Request +from scrapy import log +from source import Source +from scrapy.selector import Selector +from FourmiCrawler.items import Result +import re + + +class PubChem(Source): + """ PubChem scraper for chemical properties + + This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance. + """ + + # TO DO: make url variable with help of PubChem identifier ID given by Wikipedia + + #website = "https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297" #contains name of compound but not all parsable data + website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297" #contains properties to parse + + __spider = None + searched_compounds = [] + + def __init__(self): + Source.__init__(self) + + def parse(self, response): + """ Distributes the above described behaviour """ + log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG) + sel = Selector(response) + compound = sel.xpath('//h1/text()').extract()[0] + if compound in self.searched_compounds: + return None + else: + items = self.parse_properties(sel) + self.searched_compounds.append(compound) + return items + + def parse_properties(self, sel): + """ scrape data from 'Chemical and Physical Properties' box on PubChem. 
""" + items = [] + + + prop_names = sel.xpath('.//div[@id="d27"//div/b').\ + xpath('normalize-space(string())') + prop_values = sel.xpath('.//div[@id="d27"//div/a').\ + xpath('normalize-space(string())') + prop_sources = sel.xpath('.//div[@id="d27"//div/a[@title]').\ + xpath('normalize-space(string())') + + for i, prop_name in enumerate(prop_names): + item = Result({ + 'attribute': prop_name.extract().encode('utf-8'), + 'value': prop_values[i].extract().encode('utf-8'), + 'source': "PubChem: " + prop_sources[i].extract().encode('utf-8'), + 'reliability': "", + 'conditions': "" + }) + items.append(item) + + print item + + log.msg('PubChem prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) + + items = filter(lambda a: a['value'] != '', items) # remove items with an empty value + # item_list = self.clean_items(items) + + + return items + + def new_compound_request(self, compound): + return Request(url=self.website[:-1] + compound, callback=self.parse) + + # @staticmethod + # def clean_items(items): + # """ clean up properties using regex, makes it possible to split the values from the units """ + # for item in items: + # value = item['value'] + # m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F) + # if m: + # item['value'] = m.group(1) + " K" + # m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value) # clean up J/K/mol values + # if m: + # item['value'] = m.group(1) + " J/K/mol" + # return items From 84f2e3dbea9a2f137bf7c441bb347313cccdf11d Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 21 May 2014 14:53:51 +0200 Subject: [PATCH 02/13] Testing search function PubChem --- FourmiCrawler/sources/PubChem.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 00b2cd7..d34a2cb 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -12,10 +12,16 @@ class 
PubChem(Source): This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance. """ - # TO DO: make url variable with help of PubChem identifier ID given by Wikipedia + # TO DO: make url variable with help of PubChem identifier ID / cid #website = "https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297" #contains name of compound but not all parsable data - website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297" #contains properties to parse + # website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297" #contains properties to parse + + + website = 'https://www.ncbi.nlm.nih.gov/*' + + + search = 'pccompound?term=%s' __spider = None searched_compounds = [] @@ -31,8 +37,10 @@ class PubChem(Source): if compound in self.searched_compounds: return None else: - items = self.parse_properties(sel) + # items = self.parse_properties(sel) + items = [] self.searched_compounds.append(compound) + print items return items def parse_properties(self, sel): @@ -68,7 +76,7 @@ class PubChem(Source): return items def new_compound_request(self, compound): - return Request(url=self.website[:-1] + compound, callback=self.parse) + return Request(url=self.website[:-1] + self.search % compound, callback=self.parse) # @staticmethod # def clean_items(items): From 4b377bb9a966e4b1fd82101e865d70fae0c30b1c Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 21 May 2014 15:25:55 +0200 Subject: [PATCH 03/13] PubChem now scrapes its synonyms --- FourmiCrawler/sources/PubChem.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index d34a2cb..0ce727f 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -19,12 +19,10 @@ class PubChem(Source): website = 'https://www.ncbi.nlm.nih.gov/*' - - search = 'pccompound?term=%s' __spider = None - searched_compounds = [] + 
searched_compounds = set() def __init__(self): Source.__init__(self) @@ -34,12 +32,21 @@ class PubChem(Source): log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG) sel = Selector(response) compound = sel.xpath('//h1/text()').extract()[0] + raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0] + for synonym in raw_synonyms.strip().split(', '): + log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG) + self.searched_compounds.update(synonym) + self._spider.get_synonym_requests(synonym) + + + log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG) + if compound in self.searched_compounds: return None else: # items = self.parse_properties(sel) items = [] - self.searched_compounds.append(compound) + self.searched_compounds.update(compound) print items return items From fb41d772f203b420784582732ea64fd45d96c51d Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 21 May 2014 16:11:02 +0200 Subject: [PATCH 04/13] Added custom user-agent because otherwise it would block, because not amused by scraper --- FourmiCrawler/settings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py index be91fef..490a3a5 100644 --- a/FourmiCrawler/settings.py +++ b/FourmiCrawler/settings.py @@ -16,6 +16,8 @@ ITEM_PIPELINES = { FEED_URI = 'results.json' FEED_FORMAT = 'jsonlines' +USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36' + # Crawl responsibly by identifying yourself (and your website) on the # user-agent From 8083d0c7bc03459de2aab224a811653389aa0ebf Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 21 May 2014 16:11:48 +0200 Subject: [PATCH 05/13] PubChem scrapes synonyms, gets custom url to get data on properties from --- FourmiCrawler/sources/PubChem.py | 40 ++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git 
a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 0ce727f..e2dcc8b 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -18,8 +18,11 @@ class PubChem(Source): # website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297" #contains properties to parse - website = 'https://www.ncbi.nlm.nih.gov/*' + website = 'https://*.ncbi.nlm.nih.gov/*' + website_www = 'https://www.ncbi.nlm.nih.gov/*' + website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*' search = 'pccompound?term=%s' + data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s' __spider = None searched_compounds = set() @@ -29,26 +32,39 @@ class PubChem(Source): def parse(self, response): """ Distributes the above described behaviour """ + requests = [] log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG) + sel = Selector(response) compound = sel.xpath('//h1/text()').extract()[0] + if compound in self.searched_compounds: + return None + + self.searched_compounds.update(compound) raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0] for synonym in raw_synonyms.strip().split(', '): log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG) self.searched_compounds.update(synonym) self._spider.get_synonym_requests(synonym) - - log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG) - if compound in self.searched_compounds: - return None - else: - # items = self.parse_properties(sel) - items = [] - self.searched_compounds.update(compound) - print items - return items + n = re.search(r'cid=(\d+)',response.url) + if n: + cid = n.group(1) + log.msg('cid: %s' % cid, level=log.DEBUG) + requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data)) + + return requests + + def parse_data(self, response): + log.msg('parsing data', level=log.DEBUG) + requests = [] + + + + + return requests + def parse_properties(self, sel): """ scrape data from 'Chemical and 
Physical Properties' box on PubChem. """ @@ -83,7 +99,7 @@ class PubChem(Source): return items def new_compound_request(self, compound): - return Request(url=self.website[:-1] + self.search % compound, callback=self.parse) + return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) # @staticmethod # def clean_items(items): From ba8f8451786088c12b4645f61261ab4e8d96598b Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Mon, 2 Jun 2014 09:26:36 +0200 Subject: [PATCH 06/13] now also (finally) scrapes property values and names, but not yet coupled together and not yet returned. --- FourmiCrawler/sources/PubChem.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index e2dcc8b..6718900 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -60,12 +60,20 @@ class PubChem(Source): log.msg('parsing data', level=log.DEBUG) requests = [] + sel = Selector(response) + # props = sel.xpath('.//div') + prop_values = sel.xpath('//div//a/text()').extract() + prop_names = sel.xpath('//div//a/ancestor::div/b/text()').extract() + print prop_values + print prop_names + # print props return requests + # this (old) definition is only here to help myself def parse_properties(self, sel): """ scrape data from 'Chemical and Physical Properties' box on PubChem. 
""" items = [] @@ -95,9 +103,9 @@ class PubChem(Source): items = filter(lambda a: a['value'] != '', items) # remove items with an empty value # item_list = self.clean_items(items) - return items + def new_compound_request(self, compound): return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) From 291547a5addfb5f79dd8bcc0cb80c798f20f05db Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 4 Jun 2014 15:44:53 +0200 Subject: [PATCH 07/13] now returns good results, with property values and corresponding sources --- FourmiCrawler/sources/PubChem.py | 34 +++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 6718900..1d20231 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -61,14 +61,34 @@ class PubChem(Source): requests = [] sel = Selector(response) - # props = sel.xpath('.//div') - prop_values = sel.xpath('//div//a/text()').extract() - prop_names = sel.xpath('//div//a/ancestor::div/b/text()').extract() + props = sel.xpath('//div') - print prop_values - print prop_names - - # print props + for prop in props: + prop_name = ''.join(prop.xpath('b/text()').extract()) + if prop.xpath('a'): + prop_source = ''.join(prop.xpath('a/@title').extract()) + prop_value = ''.join(prop.xpath('a/text()').extract()) + new_prop = Result({ + 'attribute': prop_name, + 'value': prop_value, + 'source': prop_source, + 'reliability': 'Unknown', + 'conditions': '' + }) + requests.append(new_prop) + elif prop.xpath('ul'): + prop_values = prop.xpath('ul//li') + for prop_li in prop_values: + prop_value = ''.join(prop_li.xpath('a/text()').extract()) + prop_source = ''.join(prop_li.xpath('a/@title').extract()) + new_prop = Result({ + 'attribute': prop_name, + 'value': prop_value, + 'source': prop_source, + 'reliability': 'Unknown', + 'conditions': '' + }) + requests.append(new_prop) return requests From 
f1047405667c789b1a1c4238ae84eeac10834cfe Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 11 Jun 2014 16:39:00 +0200 Subject: [PATCH 08/13] cleaned up useless code --- FourmiCrawler/sources/PubChem.py | 54 +------------------------------- 1 file changed, 1 insertion(+), 53 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 1d20231..6490b20 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -12,12 +12,6 @@ class PubChem(Source): This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance. """ - # TO DO: make url variable with help of PubChem identifier ID / cid - - #website = "https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297" #contains name of compound but not all parsable data - # website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297" #contains properties to parse - - website = 'https://*.ncbi.nlm.nih.gov/*' website_www = 'https://www.ncbi.nlm.nih.gov/*' website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*' @@ -93,51 +87,5 @@ class PubChem(Source): return requests - # this (old) definition is only here to help myself - def parse_properties(self, sel): - """ scrape data from 'Chemical and Physical Properties' box on PubChem. 
""" - items = [] - - - prop_names = sel.xpath('.//div[@id="d27"//div/b').\ - xpath('normalize-space(string())') - prop_values = sel.xpath('.//div[@id="d27"//div/a').\ - xpath('normalize-space(string())') - prop_sources = sel.xpath('.//div[@id="d27"//div/a[@title]').\ - xpath('normalize-space(string())') - - for i, prop_name in enumerate(prop_names): - item = Result({ - 'attribute': prop_name.extract().encode('utf-8'), - 'value': prop_values[i].extract().encode('utf-8'), - 'source': "PubChem: " + prop_sources[i].extract().encode('utf-8'), - 'reliability': "", - 'conditions': "" - }) - items.append(item) - - print item - - log.msg('PubChem prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) - - items = filter(lambda a: a['value'] != '', items) # remove items with an empty value - # item_list = self.clean_items(items) - - return items - - def new_compound_request(self, compound): - return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) - - # @staticmethod - # def clean_items(items): - # """ clean up properties using regex, makes it possible to split the values from the units """ - # for item in items: - # value = item['value'] - # m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F) - # if m: - # item['value'] = m.group(1) + " K" - # m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value) # clean up J/K/mol values - # if m: - # item['value'] = m.group(1) + " J/K/mol" - # return items + return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) \ No newline at end of file From a903e78f9ebe4f855c9ffc0d74ce4faa95831c4f Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 11 Jun 2014 16:40:32 +0200 Subject: [PATCH 09/13] added PubChem to sources.cfg --- sources.cfg | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 sources.cfg diff --git a/sources.cfg b/sources.cfg new file mode 100644 index 0000000..a9fa2fb --- 
/dev/null +++ b/sources.cfg @@ -0,0 +1,15 @@ +[DEFAULT] +reliability = Unknown + +[ChemSpider] +reliability = High +token = 052bfd06-5ce4-43d6-bf12-89eabefd2338 + +[NIST] +reliability = High + +[WikipediaParser] +reliability = Medium + +[PubChem] +reliability = High \ No newline at end of file From 8836cdf16b758b86bc1e20402b85b2c3d4b11990 Mon Sep 17 00:00:00 2001 From: RTB Date: Wed, 11 Jun 2014 18:39:01 +0200 Subject: [PATCH 10/13] fixed config errors due to merge with develop --- FourmiCrawler/sources/PubChem.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 6490b20..ab6a99e 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -21,8 +21,9 @@ class PubChem(Source): __spider = None searched_compounds = set() - def __init__(self): - Source.__init__(self) + def __init__(self, config): + Source.__init__(self, config) + self.cfg = config def parse(self, response): """ Distributes the above described behaviour """ @@ -88,4 +89,4 @@ class PubChem(Source): def new_compound_request(self, compound): - return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) \ No newline at end of file + return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) From 4dc557d9e8e7bb5ac529e0201f577e23aeca29cb Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Tue, 17 Jun 2014 00:09:17 +0200 Subject: [PATCH 11/13] Finish plugin (comments, log messages, etc) --- FourmiCrawler/sources/PubChem.py | 33 +++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index ab6a99e..fc8250b 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -9,9 +9,11 @@ import re class PubChem(Source): """ PubChem scraper for chemical properties - This parser parses the part on PubChem 
pages that gives Chemical and Physical properties of a substance. + This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance, + including sources of the values of properties. """ + #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used website = 'https://*.ncbi.nlm.nih.gov/*' website_www = 'https://www.ncbi.nlm.nih.gov/*' website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*' @@ -26,7 +28,11 @@ class PubChem(Source): self.cfg = config def parse(self, response): - """ Distributes the above described behaviour """ + """ + Distributes the above described behaviour + :param response: The incoming search request + :return Returns the found properties if response is unique or returns none if it's already known + """ requests = [] log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG) @@ -46,12 +52,19 @@ class PubChem(Source): n = re.search(r'cid=(\d+)',response.url) if n: cid = n.group(1) - log.msg('cid: %s' % cid, level=log.DEBUG) - requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data)) + log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach + # the seperate html page which contains the properties and their values + #using this cid to get the right url and scrape it + requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data)) return requests def parse_data(self, response): + """ + Parse data found in 'Chemical and Physical properties' part of a substance page. + :param response: The response with the page to parse + :return: requests: Returns a list of properties with their values, source, etc. 
+ """ log.msg('parsing data', level=log.DEBUG) requests = [] @@ -59,8 +72,8 @@ class PubChem(Source): props = sel.xpath('//div') for prop in props: - prop_name = ''.join(prop.xpath('b/text()').extract()) - if prop.xpath('a'): + prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing + if prop.xpath('a'): # parsing for single value in property prop_source = ''.join(prop.xpath('a/@title').extract()) prop_value = ''.join(prop.xpath('a/text()').extract()) new_prop = Result({ @@ -70,8 +83,11 @@ class PubChem(Source): 'reliability': 'Unknown', 'conditions': '' }) + log.msg('PubChem prop: |%s| |%s| |%s|' % + (new_prop['attribute'], new_prop['value'], + new_prop['source']), level=log.DEBUG) requests.append(new_prop) - elif prop.xpath('ul'): + elif prop.xpath('ul'): # parsing for multiple values (list) in property prop_values = prop.xpath('ul//li') for prop_li in prop_values: prop_value = ''.join(prop_li.xpath('a/text()').extract()) @@ -83,6 +99,9 @@ class PubChem(Source): 'reliability': 'Unknown', 'conditions': '' }) + log.msg('PubChem prop: |%s| |%s| |%s|' % + (new_prop['attribute'], new_prop['value'], + new_prop['source']), level=log.DEBUG) requests.append(new_prop) return requests From 56e1d3cfb6a785b3a2b444a93eeca2fb02b2be88 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 17 Jun 2014 00:28:01 +0200 Subject: [PATCH 12/13] No config files should be included on github --- sources.cfg | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 sources.cfg diff --git a/sources.cfg b/sources.cfg deleted file mode 100644 index a9fa2fb..0000000 --- a/sources.cfg +++ /dev/null @@ -1,15 +0,0 @@ -[DEFAULT] -reliability = Unknown - -[ChemSpider] -reliability = High -token = 052bfd06-5ce4-43d6-bf12-89eabefd2338 - -[NIST] -reliability = High - -[WikipediaParser] -reliability = Medium - -[PubChem] -reliability = High \ No newline at end of file From 6e16e9f23e19016ac5a5d3eff3dd4e07cdf9e8c8 Mon Sep 17 00:00:00 2001 From: "Jip J.
Dekker" Date: Tue, 17 Jun 2014 00:33:08 +0200 Subject: [PATCH 13/13] TODO on spoofing user agent --- FourmiCrawler/settings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py index 320f573..338f224 100644 --- a/FourmiCrawler/settings.py +++ b/FourmiCrawler/settings.py @@ -18,10 +18,10 @@ ITEM_PIPELINES = { FEED_URI = 'results.json' FEED_FORMAT = 'jsonlines' -USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36' - - # Crawl responsibly by identifying yourself (and your website) on the # user-agent +# [todo] - Check for repercussions on spoofing the user agent + # USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)' +USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'