From f728dff6b09614f98b51b756c3bbd4b7f3cda12f Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 14 May 2014 12:01:05 +0200 Subject: [PATCH 01/16] Developing PubChem parser, first draft, not tested nor finished completely --- FourmiCrawler/sources/PubChem.py | 84 ++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 FourmiCrawler/sources/PubChem.py diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py new file mode 100644 index 0000000..00b2cd7 --- /dev/null +++ b/FourmiCrawler/sources/PubChem.py @@ -0,0 +1,84 @@ +from scrapy.http import Request +from scrapy import log +from source import Source +from scrapy.selector import Selector +from FourmiCrawler.items import Result +import re + + +class PubChem(Source): + """ PubChem scraper for chemical properties + + This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance. + """ + + # TO DO: make url variable with help of PubChem identifier ID given by Wikipedia + + #website = "https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297" #contains name of compound but not all parsable data + website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297" #contains properties to parse + + __spider = None + searched_compounds = [] + + def __init__(self): + Source.__init__(self) + + def parse(self, response): + """ Distributes the above described behaviour """ + log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG) + sel = Selector(response) + compound = sel.xpath('//h1/text()').extract()[0] + if compound in self.searched_compounds: + return None + else: + items = self.parse_properties(sel) + self.searched_compounds.append(compound) + return items + + def parse_properties(self, sel): + """ scrape data from 'Chemical and Physical Properties' box on PubChem. 
""" + items = [] + + + prop_names = sel.xpath('.//div[@id="d27"//div/b').\ + xpath('normalize-space(string())') + prop_values = sel.xpath('.//div[@id="d27"//div/a').\ + xpath('normalize-space(string())') + prop_sources = sel.xpath('.//div[@id="d27"//div/a[@title]').\ + xpath('normalize-space(string())') + + for i, prop_name in enumerate(prop_names): + item = Result({ + 'attribute': prop_name.extract().encode('utf-8'), + 'value': prop_values[i].extract().encode('utf-8'), + 'source': "PubChem: " + prop_sources[i].extract().encode('utf-8'), + 'reliability': "", + 'conditions': "" + }) + items.append(item) + + print item + + log.msg('PubChem prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) + + items = filter(lambda a: a['value'] != '', items) # remove items with an empty value + # item_list = self.clean_items(items) + + + return items + + def new_compound_request(self, compound): + return Request(url=self.website[:-1] + compound, callback=self.parse) + + # @staticmethod + # def clean_items(items): + # """ clean up properties using regex, makes it possible to split the values from the units """ + # for item in items: + # value = item['value'] + # m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F) + # if m: + # item['value'] = m.group(1) + " K" + # m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value) # clean up J/K/mol values + # if m: + # item['value'] = m.group(1) + " J/K/mol" + # return items From 84f2e3dbea9a2f137bf7c441bb347313cccdf11d Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 21 May 2014 14:53:51 +0200 Subject: [PATCH 02/16] Testing search function PubChem --- FourmiCrawler/sources/PubChem.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 00b2cd7..d34a2cb 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -12,10 +12,16 @@ class 
PubChem(Source): This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance. """ - # TO DO: make url variable with help of PubChem identifier ID given by Wikipedia + # TO DO: make url variable with help of PubChem identifier ID / cid #website = "https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297" #contains name of compound but not all parsable data - website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297" #contains properties to parse + # website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297" #contains properties to parse + + + website = 'https://www.ncbi.nlm.nih.gov/*' + + + search = 'pccompound?term=%s' __spider = None searched_compounds = [] @@ -31,8 +37,10 @@ class PubChem(Source): if compound in self.searched_compounds: return None else: - items = self.parse_properties(sel) + # items = self.parse_properties(sel) + items = [] self.searched_compounds.append(compound) + print items return items def parse_properties(self, sel): @@ -68,7 +76,7 @@ class PubChem(Source): return items def new_compound_request(self, compound): - return Request(url=self.website[:-1] + compound, callback=self.parse) + return Request(url=self.website[:-1] + self.search % compound, callback=self.parse) # @staticmethod # def clean_items(items): From 4b377bb9a966e4b1fd82101e865d70fae0c30b1c Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 21 May 2014 15:25:55 +0200 Subject: [PATCH 03/16] PubChem now scrapes its synonyms --- FourmiCrawler/sources/PubChem.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index d34a2cb..0ce727f 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -19,12 +19,10 @@ class PubChem(Source): website = 'https://www.ncbi.nlm.nih.gov/*' - - search = 'pccompound?term=%s' __spider = None - searched_compounds = [] + 
searched_compounds = set() def __init__(self): Source.__init__(self) @@ -34,12 +32,21 @@ class PubChem(Source): log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG) sel = Selector(response) compound = sel.xpath('//h1/text()').extract()[0] + raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0] + for synonym in raw_synonyms.strip().split(', '): + log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG) + self.searched_compounds.update(synonym) + self._spider.get_synonym_requests(synonym) + + + log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG) + if compound in self.searched_compounds: return None else: # items = self.parse_properties(sel) items = [] - self.searched_compounds.append(compound) + self.searched_compounds.update(compound) print items return items From fb41d772f203b420784582732ea64fd45d96c51d Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 21 May 2014 16:11:02 +0200 Subject: [PATCH 04/16] Added custom user-agent because otherwise it would block, because not amused by scraper --- FourmiCrawler/settings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py index be91fef..490a3a5 100644 --- a/FourmiCrawler/settings.py +++ b/FourmiCrawler/settings.py @@ -16,6 +16,8 @@ ITEM_PIPELINES = { FEED_URI = 'results.json' FEED_FORMAT = 'jsonlines' +USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36' + # Crawl responsibly by identifying yourself (and your website) on the # user-agent From 8083d0c7bc03459de2aab224a811653389aa0ebf Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 21 May 2014 16:11:48 +0200 Subject: [PATCH 05/16] PubChem scrapes synonyms, gets custom url to get data on properties from --- FourmiCrawler/sources/PubChem.py | 40 ++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git 
a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 0ce727f..e2dcc8b 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -18,8 +18,11 @@ class PubChem(Source): # website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297" #contains properties to parse - website = 'https://www.ncbi.nlm.nih.gov/*' + website = 'https://*.ncbi.nlm.nih.gov/*' + website_www = 'https://www.ncbi.nlm.nih.gov/*' + website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*' search = 'pccompound?term=%s' + data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s' __spider = None searched_compounds = set() @@ -29,26 +32,39 @@ class PubChem(Source): def parse(self, response): """ Distributes the above described behaviour """ + requests = [] log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG) + sel = Selector(response) compound = sel.xpath('//h1/text()').extract()[0] + if compound in self.searched_compounds: + return None + + self.searched_compounds.update(compound) raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0] for synonym in raw_synonyms.strip().split(', '): log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG) self.searched_compounds.update(synonym) self._spider.get_synonym_requests(synonym) - - log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG) - if compound in self.searched_compounds: - return None - else: - # items = self.parse_properties(sel) - items = [] - self.searched_compounds.update(compound) - print items - return items + n = re.search(r'cid=(\d+)',response.url) + if n: + cid = n.group(1) + log.msg('cid: %s' % cid, level=log.DEBUG) + requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data)) + + return requests + + def parse_data(self, response): + log.msg('parsing data', level=log.DEBUG) + requests = [] + + + + + return requests + def parse_properties(self, sel): """ scrape data from 'Chemical and 
Physical Properties' box on PubChem. """ @@ -83,7 +99,7 @@ class PubChem(Source): return items def new_compound_request(self, compound): - return Request(url=self.website[:-1] + self.search % compound, callback=self.parse) + return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) # @staticmethod # def clean_items(items): From ba8f8451786088c12b4645f61261ab4e8d96598b Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Mon, 2 Jun 2014 09:26:36 +0200 Subject: [PATCH 06/16] now also (finally) scrapes property values and names, but not yet coupled together and not yet returned. --- FourmiCrawler/sources/PubChem.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index e2dcc8b..6718900 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -60,12 +60,20 @@ class PubChem(Source): log.msg('parsing data', level=log.DEBUG) requests = [] + sel = Selector(response) + # props = sel.xpath('.//div') + prop_values = sel.xpath('//div//a/text()').extract() + prop_names = sel.xpath('//div//a/ancestor::div/b/text()').extract() + print prop_values + print prop_names + # print props return requests + # this (old) definition is only here to help myself def parse_properties(self, sel): """ scrape data from 'Chemical and Physical Properties' box on PubChem. 
""" items = [] @@ -95,9 +103,9 @@ class PubChem(Source): items = filter(lambda a: a['value'] != '', items) # remove items with an empty value # item_list = self.clean_items(items) - return items + def new_compound_request(self, compound): return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) From 291547a5addfb5f79dd8bcc0cb80c798f20f05db Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 4 Jun 2014 15:44:53 +0200 Subject: [PATCH 07/16] now returns good results, with property values and corresponding sources --- FourmiCrawler/sources/PubChem.py | 34 +++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 6718900..1d20231 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -61,14 +61,34 @@ class PubChem(Source): requests = [] sel = Selector(response) - # props = sel.xpath('.//div') - prop_values = sel.xpath('//div//a/text()').extract() - prop_names = sel.xpath('//div//a/ancestor::div/b/text()').extract() + props = sel.xpath('//div') - print prop_values - print prop_names - - # print props + for prop in props: + prop_name = ''.join(prop.xpath('b/text()').extract()) + if prop.xpath('a'): + prop_source = ''.join(prop.xpath('a/@title').extract()) + prop_value = ''.join(prop.xpath('a/text()').extract()) + new_prop = Result({ + 'attribute': prop_name, + 'value': prop_value, + 'source': prop_source, + 'reliability': 'Unknown', + 'conditions': '' + }) + requests.append(new_prop) + elif prop.xpath('ul'): + prop_values = prop.xpath('ul//li') + for prop_li in prop_values: + prop_value = ''.join(prop_li.xpath('a/text()').extract()) + prop_source = ''.join(prop_li.xpath('a/@title').extract()) + new_prop = Result({ + 'attribute': prop_name, + 'value': prop_value, + 'source': prop_source, + 'reliability': 'Unknown', + 'conditions': '' + }) + requests.append(new_prop) return requests From 
f1047405667c789b1a1c4238ae84eeac10834cfe Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 11 Jun 2014 16:39:00 +0200 Subject: [PATCH 08/16] cleaned up useless code --- FourmiCrawler/sources/PubChem.py | 54 +------------------------------- 1 file changed, 1 insertion(+), 53 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 1d20231..6490b20 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -12,12 +12,6 @@ class PubChem(Source): This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance. """ - # TO DO: make url variable with help of PubChem identifier ID / cid - - #website = "https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297" #contains name of compound but not all parsable data - # website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297" #contains properties to parse - - website = 'https://*.ncbi.nlm.nih.gov/*' website_www = 'https://www.ncbi.nlm.nih.gov/*' website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*' @@ -93,51 +87,5 @@ class PubChem(Source): return requests - # this (old) definition is only here to help myself - def parse_properties(self, sel): - """ scrape data from 'Chemical and Physical Properties' box on PubChem. 
""" - items = [] - - - prop_names = sel.xpath('.//div[@id="d27"//div/b').\ - xpath('normalize-space(string())') - prop_values = sel.xpath('.//div[@id="d27"//div/a').\ - xpath('normalize-space(string())') - prop_sources = sel.xpath('.//div[@id="d27"//div/a[@title]').\ - xpath('normalize-space(string())') - - for i, prop_name in enumerate(prop_names): - item = Result({ - 'attribute': prop_name.extract().encode('utf-8'), - 'value': prop_values[i].extract().encode('utf-8'), - 'source': "PubChem: " + prop_sources[i].extract().encode('utf-8'), - 'reliability': "", - 'conditions': "" - }) - items.append(item) - - print item - - log.msg('PubChem prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) - - items = filter(lambda a: a['value'] != '', items) # remove items with an empty value - # item_list = self.clean_items(items) - - return items - - def new_compound_request(self, compound): - return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) - - # @staticmethod - # def clean_items(items): - # """ clean up properties using regex, makes it possible to split the values from the units """ - # for item in items: - # value = item['value'] - # m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F) - # if m: - # item['value'] = m.group(1) + " K" - # m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value) # clean up J/K/mol values - # if m: - # item['value'] = m.group(1) + " J/K/mol" - # return items + return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) \ No newline at end of file From a903e78f9ebe4f855c9ffc0d74ce4faa95831c4f Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 11 Jun 2014 16:40:32 +0200 Subject: [PATCH 09/16] added PubChem to sources.cfg --- sources.cfg | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 sources.cfg diff --git a/sources.cfg b/sources.cfg new file mode 100644 index 0000000..a9fa2fb --- 
/dev/null +++ b/sources.cfg @@ -0,0 +1,15 @@ +[DEFAULT] +reliability = Unknown + +[ChemSpider] +reliability = High +token = 052bfd06-5ce4-43d6-bf12-89eabefd2338 + +[NIST] +reliability = High + +[WikipediaParser] +reliability = Medium + +[PubChem] +reliability = High \ No newline at end of file From 8836cdf16b758b86bc1e20402b85b2c3d4b11990 Mon Sep 17 00:00:00 2001 From: RTB Date: Wed, 11 Jun 2014 18:39:01 +0200 Subject: [PATCH 10/16] fixed config errors due to merge with develop --- FourmiCrawler/sources/PubChem.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 6490b20..ab6a99e 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -21,8 +21,9 @@ class PubChem(Source): __spider = None searched_compounds = set() - def __init__(self): - Source.__init__(self) + def __init__(self, config): + Source.__init__(self, config) + self.cfg = config def parse(self, response): """ Distributes the above described behaviour """ @@ -88,4 +89,4 @@ class PubChem(Source): def new_compound_request(self, compound): - return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) \ No newline at end of file + return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) From 4dc557d9e8e7bb5ac529e0201f577e23aeca29cb Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Tue, 17 Jun 2014 00:09:17 +0200 Subject: [PATCH 11/16] Finish plugin (comments, log messages, etc) --- FourmiCrawler/sources/PubChem.py | 33 +++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index ab6a99e..fc8250b 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -9,9 +9,11 @@ import re class PubChem(Source): """ PubChem scraper for chemical properties - This parser parses the part on PubChem 
pages that gives Chemical and Physical properties of a substance. + This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance, + including sources of the values of properties. """ + #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used website = 'https://*.ncbi.nlm.nih.gov/*' website_www = 'https://www.ncbi.nlm.nih.gov/*' website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*' @@ -26,7 +28,11 @@ class PubChem(Source): self.cfg = config def parse(self, response): - """ Distributes the above described behaviour """ + """ + Distributes the above described behaviour + :param response: The incoming search request + :return Returns the found properties if response is unique or returns none if it's already known + """ requests = [] log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG) @@ -46,12 +52,19 @@ class PubChem(Source): n = re.search(r'cid=(\d+)',response.url) if n: cid = n.group(1) - log.msg('cid: %s' % cid, level=log.DEBUG) - requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data)) + log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach + # the seperate html page which contains the properties and their values + #using this cid to get the right url and scrape it + requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data)) return requests def parse_data(self, response): + """ + Parse data found in 'Chemical and Physical properties' part of a substance page. + :param response: The response with the page to parse + :return: requests: Returns a list of properties with their values, source, etc. 
+ """ log.msg('parsing data', level=log.DEBUG) requests = [] @@ -59,8 +72,8 @@ class PubChem(Source): props = sel.xpath('//div') for prop in props: - prop_name = ''.join(prop.xpath('b/text()').extract()) - if prop.xpath('a'): + prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing + if prop.xpath('a'): # parsing for single value in property prop_source = ''.join(prop.xpath('a/@title').extract()) prop_value = ''.join(prop.xpath('a/text()').extract()) new_prop = Result({ @@ -70,8 +83,11 @@ class PubChem(Source): 'reliability': 'Unknown', 'conditions': '' }) + log.msg('PubChem prop: |%s| |%s| |%s|' % + (new_prop['attribute'], new_prop['value'], + new_prop['source']), level=log.DEBUG) requests.append(new_prop) - elif prop.xpath('ul'): + elif prop.xpath('ul'): # parsing for multiple values (list) in property prop_values = prop.xpath('ul//li') for prop_li in prop_values: prop_value = ''.join(prop_li.xpath('a/text()').extract()) @@ -83,6 +99,9 @@ class PubChem(Source): 'reliability': 'Unknown', 'conditions': '' }) + log.msg('PubChem prop: |%s| |%s| |%s|' % + (new_prop['attribute'], new_prop['value'], + new_prop['source']), level=log.DEBUG) requests.append(new_prop) return requests From 56e1d3cfb6a785b3a2b444a93eeca2fb02b2be88 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 17 Jun 2014 00:28:01 +0200 Subject: [PATCH 12/16] No config files should be included on github --- sources.cfg | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 sources.cfg diff --git a/sources.cfg b/sources.cfg deleted file mode 100644 index a9fa2fb..0000000 --- a/sources.cfg +++ /dev/null @@ -1,15 +0,0 @@ -[DEFAULT] -reliability = Unknown - -[ChemSpider] -reliability = High -token = 052bfd06-5ce4-43d6-bf12-89eabefd2338 - -[NIST] -reliability = High - -[WikipediaParser] -reliability = Medium - -[PubChem] -reliability = High \ No newline at end of file From 6e16e9f23e19016ac5a5d3eff3dd4e07cdf9e8c8 Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Tue, 17 Jun 2014 00:33:08 +0200 Subject: [PATCH 13/16] TODO on spoofing user agent --- FourmiCrawler/settings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py index 320f573..338f224 100644 --- a/FourmiCrawler/settings.py +++ b/FourmiCrawler/settings.py @@ -18,10 +18,10 @@ ITEM_PIPELINES = { FEED_URI = 'results.json' FEED_FORMAT = 'jsonlines' -USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36' - - # Crawl responsibly by identifying yourself (and your website) on the # user-agent +# [todo] - Check for repercussions on spoofing the user agent + # USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)' +USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36' From 25bf003bdbda36095bc5d972820bfb5666c8765c Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 17 Jun 2014 00:35:50 +0200 Subject: [PATCH 14/16] Added pubchem to changelog --- Changelog.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Changelog.md b/Changelog.md index 99d61fb..b1885f6 100644 --- a/Changelog.md +++ b/Changelog.md @@ -1,6 +1,7 @@ ### v0.5.3 - FIX: It is now again possible to use both verbose and the source inclusion/exclusion options - FIX: Logging is now "actually" disabled if not using the verbose option. +- FEATURE: Added support for PubChem ### v0.5.2 - FIX: Signatured used to contain untracked and older files, current signature @@ -8,4 +9,4 @@ should be correct. ### v0.5.1 - UPDATED: Logging functionality from command line -- DEV: Code cleanup and extra tests \ No newline at end of file +- DEV: Code cleanup and extra tests From bb62c335d2872d16d40e04830646adc6df59d20a Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Tue, 17 Jun 2014 00:36:31 +0200 Subject: [PATCH 15/16] Bumped version number --- fourmi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fourmi.py b/fourmi.py index 9408818..86f2808 100755 --- a/fourmi.py +++ b/fourmi.py @@ -69,7 +69,7 @@ def search(docopt_arguments, source_loader): # The start for the Fourmi Command Line interface. if __name__ == '__main__': - arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.2') + arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.3') loader = SourceLoader() if arguments["--include"]: From 35fe51d9161ba1d9bc2147125c54e0fb701008ea Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 17 Jun 2014 00:37:34 +0200 Subject: [PATCH 16/16] Signed the new version --- SIGNED.md | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/SIGNED.md b/SIGNED.md index 35d0887..3fc4507 100644 --- a/SIGNED.md +++ b/SIGNED.md @@ -3,19 +3,19 @@ -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.11 (GNU/Linux) -iQIcBAABAgAGBQJTnfIhAAoJEJrQ9RIUCT6/SbIQANKLzmkxwH11vM84kkRbmgHE -d3jLYYNEDQArCTOObYxvyrvE0BK2fhzbdBfccO9rLqu19FnBhcN3WLbkb/WM+2af -G8GkC7yFsWPs1lkrBbouvObPmqwVChGhRETd7xNU6D1NRGKLDT9lXv1FkjU2qt6P -CQwF129aTRzCZ9XGoVKG9wnKuaPm2EYkYHKlG3eck+eeKklTlmJcGi5ON7iGsUpE -hNVrSg8WwN4SzpOEgXlyBn9Zzci81XeZqy3Fnp7u1CEq5tOuWITXa1i5wQ9Jq/2n -5HP0XLbY5grW6Cpqh5jDUiX/XnNtCwpPWRnz4lCLswwMIDLCpq5tJubIay7GMvsx -fV1+UUGAR1EcWNWI0R6XJNbb2EHzidDJcLWlVo1InJDxevECq3CNnh7fRC9bixiG -EV0C/Abig/rvyX5cc9ozmwO3e0gzmtwwyywxOWLzJgVns3jfuA9MhaGDczIC1kuR -Tig9ciByErhT6v8SjgS3gyhWc+tRSx5R3M1Y78CungW3c61VA3Jo/fWHY6Db0JwH -9lVnGU4Ql4mbQQQAv7e/6r6ZhYwoBsAkOKdqT4Dn8aLaItZ8+oB2FXEl/P6V55hN -ambDSt476mwJcyDyIIwxTLyqcop2zYBdaUATe8lwo+0OoXuCLfjnThkHzy2dA0CP -xqHuzkM3Pdb6qOU3cUK7 -=PVt+ +iQIcBAABAgAGBQJTn3GgAAoJEJrQ9RIUCT6/CI4P/RSAQrd6JugGZoQu/gNdW6eB +MYCybqYGZiieVhUaGOnFNVlp68YpXH+sP/Uc6hXEX30UQEsDmhMeT5NA7ZMS+zJ9 +MNHGQdJq22lGb3+VoVBV4RTMdkQXOXvx6p5biskjIEtM3tfTxP529GvAX2TFUNnt 
+gGWk28EDr30M95XwDxwWo+57Xv8VtSb3VSvXEbrdwGYf8EoQo9oPtzYQ0YcdupcC +ET8bukYVcwpAjoTnPlEy89TiHHohwmimr2ASXeQ64Ks5wfjzcF7NENCAmaAfR+KI +VLLuGqdWMBx1ewVuAXTCZ0Mga/kBoRUaO0PC13UmL8LhhZY9Z3cwD4UnPU35/RQi +IbLfQcZHf/gEvyMeiTYCsyWpm+/xxn1+EfHol4/Q9VSXzZgRBX05Ik6tqeCvjdgG +4PyHBaJTTm/HfMNdg3mr1mbyjTv5UxglEyPv+Y4NdfoVfepkXsXbzvNSyVffZ3Bw +UaFp7KzIC4Jugdpv63FleiAdDY0+iZ5shH86wD1+HJ0/a87kn5Ao1yESby7J7U+f +poZQYeMFeuC0T5hY/3iYoyvZ68oH918ESESiucSulp5BvfwuqGL2+xo5uJIwGYXE +3IDQC7xbA14JHX86IVJlSHAD33iWyiC+5yjw4/bRRVl37KPsLdHiXH3YIRnF5I2I +ZbM/uDYyJdZbBe4UoCoF +=AMhi -----END PGP SIGNATURE----- ``` @@ -31,22 +31,23 @@ size exec file contents ./ 375 .gitignore d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1 464 .travis.yml 3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c -208 Changelog.md 370ecb699890e839e73e22822286b2b2ee7e7ec6c485908e10b8c30e7f9acd47 +428 Changelog.md c7791d1914ddca9ff1549d90468a79787a7feafe94cecd756e3d7cbd4bcbc7df FourmiCrawler/ 0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806 2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49 -716 settings.py 37a8f63e123bccc77076d574617a522b30c1d7c5e893ec3d78cc40e1563dd8a6 +914 settings.py 0be2eaf8e83e85ed27754c896421180fc80cb5ce44449aa9f1048e465d1a96f2 sources/ 9991 ChemSpider.py 847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d 9898 NIST.py 97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644 +4754 PubChem.py 58ed4c92519e385f2768cf8034b006b18f8a21632cb1c5a0849b1a329a8c6ffb 6907 WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97 0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 1262 source.py 16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4 3026 spider.py 1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a 1081 LICENSE 
36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c 3965 README.md d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3 -3659 x fourmi.py 7b4202ecfc8726fcc3f211c459aada7f5610fa4c4c0a7b916e44fc12d71010a1 +3676 x fourmi.py 2ff89f97fd2a49d08417d9ab6cf08e88944d0c45f54ec84550b530be48676c23 261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85 tests/ 1 __init__.py 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b