From f728dff6b09614f98b51b756c3bbd4b7f3cda12f Mon Sep 17 00:00:00 2001
From: Nout van Deijck
Date: Wed, 14 May 2014 12:01:05 +0200
Subject: [PATCH 01/38] Developing PubChem parser, first draft; not yet tested
 or complete

---
 FourmiCrawler/sources/PubChem.py | 84 ++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 FourmiCrawler/sources/PubChem.py

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
new file mode 100644
index 0000000..00b2cd7
--- /dev/null
+++ b/FourmiCrawler/sources/PubChem.py
@@ -0,0 +1,84 @@
+from scrapy.http import Request
+from scrapy import log
+from source import Source
+from scrapy.selector import Selector
+from FourmiCrawler.items import Result
+import re
+
+
+class PubChem(Source):
+    """ PubChem scraper for chemical properties
+
+    This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance.
+    """
+
+    # TO DO: make url variable with help of PubChem identifier ID given by Wikipedia
+
+    #website = "https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297" #contains name of compound but not all parsable data
+    website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297" #contains properties to parse
+
+    __spider = None
+    searched_compounds = []
+
+    def __init__(self):
+        Source.__init__(self)
+
+    def parse(self, response):
+        """ Distributes the above described behaviour """
+        log.msg('A response from %s just arrived!'
+                % response.url, level=log.DEBUG)
+        sel = Selector(response)
+        compound = sel.xpath('//h1/text()').extract()[0]
+        if compound in self.searched_compounds:
+            return None
+        else:
+            items = self.parse_properties(sel)
+            self.searched_compounds.append(compound)
+            return items
+
+    def parse_properties(self, sel):
+        """ scrape data from 'Chemical and Physical Properties' box on PubChem. """
""" + items = [] + + + prop_names = sel.xpath('.//div[@id="d27"//div/b').\ + xpath('normalize-space(string())') + prop_values = sel.xpath('.//div[@id="d27"//div/a').\ + xpath('normalize-space(string())') + prop_sources = sel.xpath('.//div[@id="d27"//div/a[@title]').\ + xpath('normalize-space(string())') + + for i, prop_name in enumerate(prop_names): + item = Result({ + 'attribute': prop_name.extract().encode('utf-8'), + 'value': prop_values[i].extract().encode('utf-8'), + 'source': "PubChem: " + prop_sources[i].extract().encode('utf-8'), + 'reliability': "", + 'conditions': "" + }) + items.append(item) + + print item + + log.msg('PubChem prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) + + items = filter(lambda a: a['value'] != '', items) # remove items with an empty value + # item_list = self.clean_items(items) + + + return items + + def new_compound_request(self, compound): + return Request(url=self.website[:-1] + compound, callback=self.parse) + + # @staticmethod + # def clean_items(items): + # """ clean up properties using regex, makes it possible to split the values from the units """ + # for item in items: + # value = item['value'] + # m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F) + # if m: + # item['value'] = m.group(1) + " K" + # m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value) # clean up J/K/mol values + # if m: + # item['value'] = m.group(1) + " J/K/mol" + # return items From 84f2e3dbea9a2f137bf7c441bb347313cccdf11d Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 21 May 2014 14:53:51 +0200 Subject: [PATCH 02/38] Testing search function PubChem --- FourmiCrawler/sources/PubChem.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 00b2cd7..d34a2cb 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -12,10 +12,16 @@ class PubChem(Source): This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance. 
""" - # TO DO: make url variable with help of PubChem identifier ID given by Wikipedia + # TO DO: make url variable with help of PubChem identifier ID / cid #website = "https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297" #contains name of compound but not all parsable data - website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297" #contains properties to parse + # website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297" #contains properties to parse + + + website = 'https://www.ncbi.nlm.nih.gov/*' + + + search = 'pccompound?term=%s' __spider = None searched_compounds = [] @@ -31,8 +37,10 @@ class PubChem(Source): if compound in self.searched_compounds: return None else: - items = self.parse_properties(sel) + # items = self.parse_properties(sel) + items = [] self.searched_compounds.append(compound) + print items return items def parse_properties(self, sel): @@ -68,7 +76,7 @@ class PubChem(Source): return items def new_compound_request(self, compound): - return Request(url=self.website[:-1] + compound, callback=self.parse) + return Request(url=self.website[:-1] + self.search % compound, callback=self.parse) # @staticmethod # def clean_items(items): From 4b377bb9a966e4b1fd82101e865d70fae0c30b1c Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 21 May 2014 15:25:55 +0200 Subject: [PATCH 03/38] PubChem now scrapes its synonyms --- FourmiCrawler/sources/PubChem.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index d34a2cb..0ce727f 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -19,12 +19,10 @@ class PubChem(Source): website = 'https://www.ncbi.nlm.nih.gov/*' - - search = 'pccompound?term=%s' __spider = None - searched_compounds = [] + searched_compounds = set() def __init__(self): Source.__init__(self) @@ -34,12 +32,21 @@ class PubChem(Source): log.msg('A response from %s just arrived!' 
% response.url, level=log.DEBUG) sel = Selector(response) compound = sel.xpath('//h1/text()').extract()[0] + raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0] + for synonym in raw_synonyms.strip().split(', '): + log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG) + self.searched_compounds.update(synonym) + self._spider.get_synonym_requests(synonym) + + + log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG) + if compound in self.searched_compounds: return None else: # items = self.parse_properties(sel) items = [] - self.searched_compounds.append(compound) + self.searched_compounds.update(compound) print items return items From fb41d772f203b420784582732ea64fd45d96c51d Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 21 May 2014 16:11:02 +0200 Subject: [PATCH 04/38] Added custom user-agent because otherwise it would block, because not amused by scraper --- FourmiCrawler/settings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py index be91fef..490a3a5 100644 --- a/FourmiCrawler/settings.py +++ b/FourmiCrawler/settings.py @@ -16,6 +16,8 @@ ITEM_PIPELINES = { FEED_URI = 'results.json' FEED_FORMAT = 'jsonlines' +USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36' + # Crawl responsibly by identifying yourself (and your website) on the # user-agent From 8083d0c7bc03459de2aab224a811653389aa0ebf Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 21 May 2014 16:11:48 +0200 Subject: [PATCH 05/38] PubChem scrapes synonyms, gets custom url to get data on properties from --- FourmiCrawler/sources/PubChem.py | 40 ++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 0ce727f..e2dcc8b 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -18,8 +18,11 @@ class PubChem(Source): # website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297" #contains properties to parse - website = 'https://www.ncbi.nlm.nih.gov/*' + website = 'https://*.ncbi.nlm.nih.gov/*' + website_www = 'https://www.ncbi.nlm.nih.gov/*' + website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*' search = 'pccompound?term=%s' + data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s' __spider = None searched_compounds = set() @@ -29,26 +32,39 @@ class PubChem(Source): def parse(self, response): """ Distributes the above described behaviour """ + requests = [] log.msg('A response from %s just arrived!' 
% response.url, level=log.DEBUG) + sel = Selector(response) compound = sel.xpath('//h1/text()').extract()[0] + if compound in self.searched_compounds: + return None + + self.searched_compounds.update(compound) raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0] for synonym in raw_synonyms.strip().split(', '): log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG) self.searched_compounds.update(synonym) self._spider.get_synonym_requests(synonym) - - log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG) - if compound in self.searched_compounds: - return None - else: - # items = self.parse_properties(sel) - items = [] - self.searched_compounds.update(compound) - print items - return items + n = re.search(r'cid=(\d+)',response.url) + if n: + cid = n.group(1) + log.msg('cid: %s' % cid, level=log.DEBUG) + requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data)) + + return requests + + def parse_data(self, response): + log.msg('parsing data', level=log.DEBUG) + requests = [] + + + + + return requests + def parse_properties(self, sel): """ scrape data from 'Chemical and Physical Properties' box on PubChem. """ @@ -83,7 +99,7 @@ class PubChem(Source): return items def new_compound_request(self, compound): - return Request(url=self.website[:-1] + self.search % compound, callback=self.parse) + return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) # @staticmethod # def clean_items(items): From ba8f8451786088c12b4645f61261ab4e8d96598b Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Mon, 2 Jun 2014 09:26:36 +0200 Subject: [PATCH 06/38] now also (finally) scrapes property values and names, but not yet coupled together and not yet returned. --- FourmiCrawler/sources/PubChem.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index e2dcc8b..6718900 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -60,12 +60,20 @@ class PubChem(Source): log.msg('parsing data', level=log.DEBUG) requests = [] + sel = Selector(response) + # props = sel.xpath('.//div') + prop_values = sel.xpath('//div//a/text()').extract() + prop_names = sel.xpath('//div//a/ancestor::div/b/text()').extract() + print prop_values + print prop_names + # print props return requests + # this (old) definition is only here to help myself def parse_properties(self, sel): """ scrape data from 'Chemical and Physical Properties' box on PubChem. 
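
A caveat in the synonym bookkeeping introduced in patch 03 and kept through patch 05 above: `set.update()` iterates its argument, so calling it with a single string adds the individual characters rather than the name. A minimal demonstration of the difference, assuming `searched_compounds` is meant to hold whole compound names:

```python
seen = set()
seen.update('water')    # update() treats the string as an iterable of characters
print(seen)             # set(['a', 'e', 'r', 't', 'w']) - so 'water' in seen is False
seen.add('water')       # add() stores the whole name, which is what the test
print('water' in seen)  # `if compound in self.searched_compounds` actually needs
```
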
""" items = [] @@ -95,9 +103,9 @@ class PubChem(Source): items = filter(lambda a: a['value'] != '', items) # remove items with an empty value # item_list = self.clean_items(items) - return items + def new_compound_request(self, compound): return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) From 291547a5addfb5f79dd8bcc0cb80c798f20f05db Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 4 Jun 2014 15:44:53 +0200 Subject: [PATCH 07/38] now returns good results, with property values and corresponding sources --- FourmiCrawler/sources/PubChem.py | 34 +++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 6718900..1d20231 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -61,14 +61,34 @@ class PubChem(Source): requests = [] sel = Selector(response) - # props = sel.xpath('.//div') - prop_values = sel.xpath('//div//a/text()').extract() - prop_names = sel.xpath('//div//a/ancestor::div/b/text()').extract() + props = sel.xpath('//div') - print prop_values - print prop_names - - # print props + for prop in props: + prop_name = ''.join(prop.xpath('b/text()').extract()) + if prop.xpath('a'): + prop_source = ''.join(prop.xpath('a/@title').extract()) + prop_value = ''.join(prop.xpath('a/text()').extract()) + new_prop = Result({ + 'attribute': prop_name, + 'value': prop_value, + 'source': prop_source, + 'reliability': 'Unknown', + 'conditions': '' + }) + requests.append(new_prop) + elif prop.xpath('ul'): + prop_values = prop.xpath('ul//li') + for prop_li in prop_values: + prop_value = ''.join(prop_li.xpath('a/text()').extract()) + prop_source = ''.join(prop_li.xpath('a/@title').extract()) + new_prop = Result({ + 'attribute': prop_name, + 'value': prop_value, + 'source': prop_source, + 'reliability': 'Unknown', + 'conditions': '' + }) + requests.append(new_prop) return requests From f1047405667c789b1a1c4238ae84eeac10834cfe Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 11 Jun 2014 16:39:00 +0200 Subject: [PATCH 08/38] cleaned up useless code --- FourmiCrawler/sources/PubChem.py | 54 +------------------------------- 1 file changed, 1 insertion(+), 53 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 1d20231..6490b20 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -12,12 +12,6 @@ class PubChem(Source): This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance. """ - # TO DO: make url variable with help of PubChem identifier ID / cid - - #website = "https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297" #contains name of compound but not all parsable data - # website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297" #contains properties to parse - - website = 'https://*.ncbi.nlm.nih.gov/*' website_www = 'https://www.ncbi.nlm.nih.gov/*' website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*' @@ -93,51 +87,5 @@ class PubChem(Source): return requests - # this (old) definition is only here to help myself - def parse_properties(self, sel): - """ scrape data from 'Chemical and Physical Properties' box on PubChem. 
""" - items = [] - - - prop_names = sel.xpath('.//div[@id="d27"//div/b').\ - xpath('normalize-space(string())') - prop_values = sel.xpath('.//div[@id="d27"//div/a').\ - xpath('normalize-space(string())') - prop_sources = sel.xpath('.//div[@id="d27"//div/a[@title]').\ - xpath('normalize-space(string())') - - for i, prop_name in enumerate(prop_names): - item = Result({ - 'attribute': prop_name.extract().encode('utf-8'), - 'value': prop_values[i].extract().encode('utf-8'), - 'source': "PubChem: " + prop_sources[i].extract().encode('utf-8'), - 'reliability': "", - 'conditions': "" - }) - items.append(item) - - print item - - log.msg('PubChem prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) - - items = filter(lambda a: a['value'] != '', items) # remove items with an empty value - # item_list = self.clean_items(items) - - return items - - def new_compound_request(self, compound): - return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) - - # @staticmethod - # def clean_items(items): - # """ clean up properties using regex, makes it possible to split the values from the units """ - # for item in items: - # value = item['value'] - # m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F) - # if m: - # item['value'] = m.group(1) + " K" - # m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value) # clean up J/K/mol values - # if m: - # item['value'] = m.group(1) + " J/K/mol" - # return items + return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) \ No newline at end of file From a903e78f9ebe4f855c9ffc0d74ce4faa95831c4f Mon Sep 17 00:00:00 2001 From: Nout van Deijck Date: Wed, 11 Jun 2014 16:40:32 +0200 Subject: [PATCH 09/38] added PubChem to sources.cfg --- sources.cfg | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 sources.cfg diff --git a/sources.cfg b/sources.cfg new file mode 100644 index 0000000..a9fa2fb --- /dev/null +++ b/sources.cfg @@ -0,0 +1,15 @@ +[DEFAULT] +reliability = Unknown + +[ChemSpider] +reliability = High +token = 052bfd06-5ce4-43d6-bf12-89eabefd2338 + +[NIST] +reliability = High + +[WikipediaParser] +reliability = Medium + +[PubChem] +reliability = High \ No newline at end of file From 8836cdf16b758b86bc1e20402b85b2c3d4b11990 Mon Sep 17 00:00:00 2001 From: RTB Date: Wed, 11 Jun 2014 18:39:01 +0200 Subject: [PATCH 10/38] fixed config errors due to merge with develop --- FourmiCrawler/sources/PubChem.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 6490b20..ab6a99e 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -21,8 +21,9 @@ class PubChem(Source): __spider = None searched_compounds = set() - def __init__(self): - Source.__init__(self) + def __init__(self, config): + Source.__init__(self, config) + self.cfg = config def parse(self, response): """ Distributes the above described behaviour """ @@ -88,4 +89,4 @@ class PubChem(Source): def new_compound_request(self, compound): - return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) \ No newline at end of file + return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) From ee7f1ab739a4b3004635914a02c14baa5b5510b5 Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 15 Jun 2014 19:26:13 +0200 Subject: [PATCH 11/38] Updated the Objectives and linkage to the wiki --- README.md | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 48b0419..f09f77c 100644 --- a/README.md +++ b/README.md @@ -23,21 +23,21 @@ documentation](http://doc.scrapy.org/en/latest/index.html). ### Installing -If you're installing Fourmi, please take a look at our [installation guide](...) -on our wiki. When you've installed the application, make sure to check our -[usage guide](...). +If you're installing Fourmi, please take a look at our installation guides +on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our +usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI). ### Using the Source To use the Fourmi source code multiple dependencies are required. Take a look at -the [wiki page](...) on using the application source code for a step by step +our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code in our a step by step installation guide. When developing for the Fourmi project keep in mind that code readability is a must. To maintain the readability, code should be conform with the [PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python code. More information about the different structures and principles of the -Fourmi application can be found on our [wiki](...). +Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki). ### To Do @@ -45,13 +45,9 @@ The Fourmi project has the following goals for the nearby future: __Main goals:__ -- Improve our documentation and guides. (Assignee: Dekker) - Build an graphical user interface(GUI) as alternative for the command line interface(CLI). (Assignee: Harmen) - Compiling the source into an windows executable. (Assignee: Bas) -- Create an configuration file to hold logins and API keys. -- Determine reliability of our data point. -- Create an module to gather data from NIST. (Assignee: Rob) - Create an module to gather data from PubChem. (Assignee: Nout) __Side goals:__ From 2eb8f3e0af18cad1adafeb8d6e2783b483539c35 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 19:38:52 +0200 Subject: [PATCH 12/38] Changed logging CL option --- fourmi.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fourmi.py b/fourmi.py index e6d7e9a..ab4baef 100755 --- a/fourmi.py +++ b/fourmi.py @@ -5,6 +5,7 @@ Fourmi, a web scraper build to search specific information for a given compound Usage: fourmi search fourmi [options] search + fourmi [-v | -vv | -vvv] [options] search fourmi [options] [--include= | --exclude=] search fourmi list fourmi [--include= | --exclude=] list @@ -15,7 +16,7 @@ Options: --attributes= Include only that match these regular expressions split by a comma. [default: .*] -h --help Show this screen. --version Show version. - --verbose Verbose logging output. + -v Verbose logging output. (Multiple occurrences increase logging level) --log= Save log to an file. 
From f604c3efcc62b39b139651b440ce46761204a0d9 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 15 Jun 2014 19:50:31 +0200
Subject: [PATCH 13/38] The logging now uses the Scrapy setting overrides

---
 fourmi.py             |  4 +---
 utils/configurator.py | 39 +++++++++++++++++++++++++------------
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/fourmi.py b/fourmi.py
index ab4baef..1fd54e7 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -58,7 +58,7 @@ def search(docopt_arguments, source_loader):
     :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
     """
     conf = Configurator()
-    conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
+    conf.start_log(docopt_arguments["--log"], docopt_arguments["-v"])
     conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
     setup_crawler(docopt_arguments[""], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
     reactor.run()

diff --git a/utils/configurator.py b/utils/configurator.py
index dfc6330..25a4883 100644
--- a/utils/configurator.py
+++ b/utils/configurator.py
@@ -1,6 +1,8 @@
+import ConfigParser
+
 from scrapy import log
 from scrapy.utils.project import get_project_settings
-import ConfigParser
+
 
 class Configurator:
     """
 
     def start_log(self, logfile, verbose):
         """
-        This function starts the logging functionality of Scrapy using the settings given by the CLI.
+        This function changes the default settings of Scapy's logging functionality
+        using the settings given by the CLI.
         :param logfile: The location where the logfile will be saved.
-        :param verbose: A boolean value to switch between loglevels.
+        :param verbose: An integer value to switch between loglevels.
""" - if logfile is not None: - if verbose: - log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG) - else: - log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING) + if verbose != 0: + self.scrapy_settings.overrides["LOG_ENABLED"] = True else: - if verbose: - log.start(logstdout=False, loglevel=log.DEBUG) - else: - log.start(logstdout=True, loglevel=log.WARNING) + self.scrapy_settings.overrides["LOG_ENABLED"] = False + + if verbose == 1: + self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING" + elif verbose == 2: + self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO" + else: + self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG" + + if verbose > 1: + self.scrapy_settings.overrides["LOG_STDOUT"] = False + else: + self.scrapy_settings.overrides["LOG_STDOUT"] = True + + if logfile is not None: + self.scrapy_settings.overrides["LOG_FILE"] = logfile + else: + self.scrapy_settings.overrides["LOG_FILE"] = None + @staticmethod def read_sourceconfiguration(): From f604c3efcc62b39b139651b440ce46761204a0d9 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 20:07:11 +0200 Subject: [PATCH 14/38] Utils can't use the logging facilities as they aren't started yet --- utils/configurator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/utils/configurator.py b/utils/configurator.py index 25a4883..7c1aaa8 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -1,6 +1,5 @@ import ConfigParser -from scrapy import log from scrapy.utils.project import get_project_settings @@ -90,7 +89,6 @@ class Configurator: elif config.defaults(): section = config.defaults() if 'reliability' not in section: - log.msg('Reliability not set for %s' % sourcename, - level=log.WARNING) + print 'Reliability not set for %s' % sourcename section['reliability'] = '' return section From 3ea950b93662d741f2b0d971ba43f3c9804c55eb Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 20:09:40 +0200 Subject: [PATCH 15/38] Logging facility is working again. --- fourmi.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fourmi.py b/fourmi.py index 1fd54e7..95fc53a 100755 --- a/fourmi.py +++ b/fourmi.py @@ -26,7 +26,7 @@ Options: from twisted.internet import reactor from scrapy.crawler import Crawler -from scrapy import signals +from scrapy import signals, log import docopt from FourmiCrawler.spider import FourmiSpider @@ -61,6 +61,7 @@ def search(docopt_arguments, source_loader): conf.start_log(docopt_arguments["--log"], docopt_arguments["-v"]) conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) setup_crawler(docopt_arguments[""], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) + log.start(conf.scrapy_settings.get("LOG_FILE"), conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) reactor.run() From 3fe2cde892ba1889d7d845a71c2e41a8037781be Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 15 Jun 2014 20:10:17 +0200 Subject: [PATCH 16/38] Error message clearly labeled as a warning --- utils/configurator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/configurator.py b/utils/configurator.py index 7c1aaa8..5cde4d5 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -89,6 +89,6 @@ class Configurator: elif config.defaults(): section = config.defaults() if 'reliability' not in section: - print 'Reliability not set for %s' % sourcename + print 'WARNING: Reliability not set for %s' % sourcename section['reliability'] = '' return section From e3d6087ed43e6c38ab6c156ea9926447e7867028 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 20:12:23 +0200 Subject: [PATCH 17/38] renamed logging function --- tests/test_configurator.py | 11 +++++------ utils/configurator.py | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/test_configurator.py b/tests/test_configurator.py index eb43cb7..cf54132 100644 --- a/tests/test_configurator.py +++ b/tests/test_configurator.py @@ -1,7 +1,8 @@ import unittest +import ConfigParser + from utils.configurator import Configurator -import ConfigParser class TestConfigurator(unittest.TestCase): @@ -21,11 +22,9 @@ class TestConfigurator(unittest.TestCase): self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv") self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") - # def test_start_log(self): - # self.conf.start_log("test.log", True) - # self.conf.start_log("test.log", False) - # self.conf.start_log(None, True) - # self.conf.start_log(None, False) + def test_start_log(self): + for i in range(0 ,3): + self.conf.set_logging() def test_read_sourceconfiguration(self): config = self.conf.read_sourceconfiguration() diff --git a/utils/configurator.py b/utils/configurator.py index 5cde4d5..03ef38f 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -32,7 +32,7 @@ class Configurator: self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat - def start_log(self, logfile, verbose): + def set_logging(self, logfile, verbose): """ This function changes the default settings of Scapy's logging functionality using the settings given by the CLI. From 435356c3212e5f6656fd7f560217c398f2a26d16 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 20:32:24 +0200 Subject: [PATCH 18/38] Added default values to the logging function --- fourmi.py | 2 +- utils/configurator.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fourmi.py b/fourmi.py index 95fc53a..1b9237c 100755 --- a/fourmi.py +++ b/fourmi.py @@ -58,7 +58,7 @@ def search(docopt_arguments, source_loader): :param source_loader: An initiated SourceLoader object pointed at the directory with the sources. 
""" conf = Configurator() - conf.start_log(docopt_arguments["--log"], docopt_arguments["-v"]) + conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"]) conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) setup_crawler(docopt_arguments[""], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) log.start(conf.scrapy_settings.get("LOG_FILE"), conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) diff --git a/utils/configurator.py b/utils/configurator.py index 03ef38f..7dc27c5 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -32,7 +32,7 @@ class Configurator: self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat - def set_logging(self, logfile, verbose): + def set_logging(self, logfile=None, verbose=0): """ This function changes the default settings of Scapy's logging functionality using the settings given by the CLI. From fa42562b8e63bc049cac5a8769b02f7dd72a97c1 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 20:33:58 +0200 Subject: [PATCH 19/38] Tests for the Logging Functionality --- tests/test_configurator.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/tests/test_configurator.py b/tests/test_configurator.py index cf54132..df29da9 100644 --- a/tests/test_configurator.py +++ b/tests/test_configurator.py @@ -23,8 +23,27 @@ class TestConfigurator(unittest.TestCase): self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") def test_start_log(self): - for i in range(0 ,3): - self.conf.set_logging() + for i in range(0, 3): + self.conf.set_logging("TEST", i) + self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST") + if i > 0: + self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True) + if i > 1: + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False) + else: + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True) + else: + self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False) + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True) + if i == 1: + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING") + elif i == 2: + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO") + elif i == 3: + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG") + + self.conf.set_logging(verbose=i) + self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None) def test_read_sourceconfiguration(self): config = self.conf.read_sourceconfiguration() From 66f2384747a5a86aba034729f532794e7c06e8fe Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 15 Jun 2014 20:41:19 +0200 Subject: [PATCH 20/38] Default arguments can't be mutable --- FourmiCrawler/sources/ChemSpider.py | 7 +++++-- FourmiCrawler/sources/NIST.py | 9 +++++---- FourmiCrawler/sources/WikipediaParser.py | 14 +++++++++----- FourmiCrawler/sources/source.py | 2 +- FourmiCrawler/spider.py | 7 +++++-- 5 files changed, 25 insertions(+), 14 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 87a6ee7..fb51a4a 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -26,9 +26,12 @@ class ChemSpider(Source): structure = 'Chemical-Structure.%s.html' extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' - def __init__(self, config={}): + def __init__(self, config=None): Source.__init__(self, config) - self.cfg = config + if self.cfg is None: + self.cfg = {} + else: + self.cfg = config self.ignore_list = [] if 'token' not in self.cfg or self.cfg['token'] == '': log.msg('ChemSpider token not set or empty, search/MassSpec API ' diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 3c323ef..d71d08f 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -22,12 +22,13 @@ class NIST(Source): search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' - cfg = {} - - def __init__(self, config={}): + def __init__(self, config=None): Source.__init__(self, config) self.ignore_list = set() - self.cfg = config + if config is None: + self.cfg = {} + else: + self.cfg = config def parse(self, response): sel = Selector(response) diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index 4aa49b2..b995f30 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -1,9 +1,11 @@ +import re + from scrapy.http import Request from scrapy import log -from source import Source from scrapy.selector import Selector + +from source import Source from FourmiCrawler.items import Result -import re class WikipediaParser(Source): @@ -17,11 +19,13 @@ class WikipediaParser(Source): __spider = None searched_compounds = [] - cfg = {} - def __init__(self, config={}): + def __init__(self, config=None): Source.__init__(self, config) - self.cfg = config + if config is None: + self.cfg = {} + else: + self.cfg = config def parse(self, response): """ diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py index a609bb9..fe36784 100644 --- a/FourmiCrawler/sources/source.py +++ b/FourmiCrawler/sources/source.py @@ -6,7 +6,7 @@ class Source: website = "http://something/*" # Regex of URI's the source is able to parse _spider = None - def __init__(self, config={}): + def __init__(self, config=None): """ Initiation of a new Source """ diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 5c09f07..7552c7d 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -10,7 +10,7 @@ class FourmiSpider(Spider): """ name = "FourmiSpider" - def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs): + def __init__(self, compound=None, selected_attributes=None, *args, **kwargs): """ Initiation of the Spider :param compound: compound that will be searched. 
@@ -20,7 +20,10 @@ class FourmiSpider(Spider): self.synonyms = set() super(FourmiSpider, self).__init__(*args, **kwargs) self.synonyms.add(compound) - self.selected_attributes = selected_attributes + if selected_attributes is None: + self.selected_attributes = [".*"] + else: + self.selected_attributes = selected_attributes def parse(self, response): """ From 74e7152d5fc2d35ad109f2660bb2385cdc04526d Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 20:45:35 +0200 Subject: [PATCH 21/38] A lot of PEP-8 fixes --- FourmiCrawler/sources/ChemSpider.py | 6 +++--- FourmiCrawler/sources/NIST.py | 14 +++++++------- FourmiCrawler/sources/WikipediaParser.py | 17 ++++++++--------- FourmiCrawler/spider.py | 2 +- fourmi.py | 6 ++++-- tests/test_spider.py | 1 - utils/configurator.py | 5 +---- utils/sourceloader.py | 1 + 8 files changed, 25 insertions(+), 27 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index fb51a4a..3f1538f 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -40,7 +40,6 @@ class ChemSpider(Source): self.search += self.cfg['token'] self.extendedinfo += self.cfg['token'] - def parse(self, response): sel = Selector(response) requests = [] @@ -202,13 +201,14 @@ class ChemSpider(Source): return properties def newresult(self, attribute, value, conditions='', source='ChemSpider'): - return Result({ + return Result( + { 'attribute': attribute, 'value': value, 'source': source, 'reliability': self.cfg['reliability'], 'conditions': conditions - }) + }) def parse_searchrequest(self, response): """Parse the initial response of the ChemSpider Search API """ diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index d71d08f..e81db5a 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -89,7 +89,6 @@ class NIST(Source): InChiKey, CAS number """ ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]') - li = ul.xpath('li') raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract() for synonym in raw_synonyms[0].strip().split(';\n'): @@ -256,12 +255,13 @@ class NIST(Source): return results def newresult(self, attribute, value, conditions=''): - return Result({ - 'attribute': attribute, - 'value': value, - 'source': 'NIST', - 'reliability': self.cfg['reliability'], - 'conditions': conditions + return Result( + { + 'attribute': attribute, + 'value': value, + 'source': 'NIST', + 'reliability': self.cfg['reliability'], + 'conditions': conditions }) def new_compound_request(self, compound): diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index b995f30..cfd2555 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -19,7 +19,6 @@ class WikipediaParser(Source): __spider = None searched_compounds = [] - def __init__(self, config=None): Source.__init__(self, config) if config is None: @@ -57,7 +56,7 @@ class WikipediaParser(Source): # scrape the chembox (wikipedia template) items = self.parse_chembox(sel, items) - #scrape the drugbox (wikipedia template) + # scrape the drugbox (wikipedia template) items = self.parse_drugbox(sel, items) items = filter(lambda a: a['value'] != '', items) # remove items with an empty value @@ -127,7 +126,6 @@ class WikipediaParser(Source): level=log.DEBUG) return items - def new_compound_request(self, compound): return Request(url=self.website[:-1] + compound, callback=self.parse) @@ -165,10 +163,11 @@ class 
WikipediaParser(Source): return links def newresult(self, attribute, value): - return Result({ - 'attribute': attribute, - 'value': value, - 'source': 'Wikipedia', - 'reliability': self.cfg['reliability'], - 'conditions': '' + return Result( + { + 'attribute': attribute, + 'value': value, + 'source': 'Wikipedia', + 'reliability': self.cfg['reliability'], + 'conditions': '' }) diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 7552c7d..ebfd2cf 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -21,7 +21,7 @@ class FourmiSpider(Spider): super(FourmiSpider, self).__init__(*args, **kwargs) self.synonyms.add(compound) if selected_attributes is None: - self.selected_attributes = [".*"] + self.selected_attributes = [".*"] else: self.selected_attributes = selected_attributes diff --git a/fourmi.py b/fourmi.py index 1b9237c..2a422ef 100755 --- a/fourmi.py +++ b/fourmi.py @@ -60,8 +60,10 @@ def search(docopt_arguments, source_loader): conf = Configurator() conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"]) conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) - setup_crawler(docopt_arguments[""], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) - log.start(conf.scrapy_settings.get("LOG_FILE"), conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) + setup_crawler(docopt_arguments[""], conf.scrapy_settings, + source_loader, docopt_arguments["--attributes"].split(',')) + log.start(conf.scrapy_settings.get("LOG_FILE"), + conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) reactor.run() diff --git a/tests/test_spider.py b/tests/test_spider.py index 589a571..1ee40b1 100644 --- a/tests/test_spider.py +++ b/tests/test_spider.py @@ -47,7 +47,6 @@ class TestFoumiSpider(unittest.TestCase): self.assertGreater(len(requests), 0) self.assertIsInstance(requests[0], Request) - def test_synonym_requests(self): # A test for the synonym request function self.spi._sources = [] diff --git a/utils/configurator.py b/utils/configurator.py index 7dc27c5..62987c6 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -12,7 +12,6 @@ class Configurator: def __init__(self): self.scrapy_settings = get_project_settings() - def set_output(self, filename, fileformat): """ This function manipulates the Scrapy output file settings that normally would be set in the settings file. @@ -31,7 +30,6 @@ class Configurator: if fileformat is not None: self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat - def set_logging(self, logfile=None, verbose=0): """ This function changes the default settings of Scapy's logging functionality @@ -61,7 +59,6 @@ class Configurator: else: self.scrapy_settings.overrides["LOG_FILE"] = None - @staticmethod def read_sourceconfiguration(): """ @@ -70,7 +67,7 @@ class Configurator: :return a ConfigParser object of sources.cfg """ config = ConfigParser.ConfigParser() - config.read('sources.cfg') # [TODO]: should be softcoded eventually + config.read('sources.cfg') # [TODO]: should be softcoded eventually return config @staticmethod diff --git a/utils/sourceloader.py b/utils/sourceloader.py index 9b33657..8c54464 100644 --- a/utils/sourceloader.py +++ b/utils/sourceloader.py @@ -5,6 +5,7 @@ import re from FourmiCrawler.sources.source import Source from utils.configurator import Configurator + class SourceLoader: sources = [] From 79cf15b95c30cd937a9394df92ed87a3a635c07e Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 15 Jun 2014 20:50:00 +0200 Subject: [PATCH 22/38] Refractoring double code --- FourmiCrawler/sources/ChemSpider.py | 4 ---- FourmiCrawler/sources/NIST.py | 4 ---- FourmiCrawler/sources/WikipediaParser.py | 4 ---- FourmiCrawler/sources/source.py | 3 +++ 4 files changed, 3 insertions(+), 12 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 3f1538f..0110e57 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -28,10 +28,6 @@ class ChemSpider(Source): def __init__(self, config=None): Source.__init__(self, config) - if self.cfg is None: - self.cfg = {} - else: - self.cfg = config self.ignore_list = [] if 'token' not in self.cfg or self.cfg['token'] == '': log.msg('ChemSpider token not set or empty, search/MassSpec API ' diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index e81db5a..934b457 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -25,10 +25,6 @@ class NIST(Source): def __init__(self, config=None): Source.__init__(self, config) self.ignore_list = set() - if config is None: - self.cfg = {} - else: - self.cfg = config def parse(self, response): sel = Selector(response) diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index cfd2555..401698c 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -21,10 +21,6 @@ class WikipediaParser(Source): def __init__(self, config=None): Source.__init__(self, config) - if config is None: - self.cfg = {} - else: - self.cfg = config def parse(self, response): """ diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py index fe36784..36218b0 100644 --- a/FourmiCrawler/sources/source.py +++ b/FourmiCrawler/sources/source.py @@ -10,6 +10,9 @@ class Source: """ Initiation of a new Source """ + self.cfg = {} + if config is not None: + self.cfg = config pass def parse(self, response): From 147b148dbdfa102de5b1b6d002480cb6acfca39d Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 21:00:36 +0200 Subject: [PATCH 23/38] Force a attribute of the test item to be None --- tests/test_pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index dfb8e83..eb2b070 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -13,6 +13,7 @@ class TestPipelines(unittest.TestCase): def test_none_pipeline(self): # Testing the pipeline that replaces the None values in items. self.testItem["value"] = "abc" + self.testItem["source"] = None pipe = pipelines.RemoveNonePipeline() processed = pipe.process_item(self.testItem, spider.FourmiSpider()) From a27e1e4bdd30b402a2c0ec99f8556c777bf57197 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 21:09:43 +0200 Subject: [PATCH 24/38] Bumped version number --- fourmi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fourmi.py b/fourmi.py index 2a422ef..55a3c20 100755 --- a/fourmi.py +++ b/fourmi.py @@ -69,7 +69,7 @@ def search(docopt_arguments, source_loader): # The start for the Fourmi Command Line interface. if __name__ == '__main__': - arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0') + arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.1') loader = SourceLoader() if arguments["--include"]: From 5f3ade8ff9f29d36e86a6e5d6b598cdc9870d60e Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 15 Jun 2014 21:11:30 +0200 Subject: [PATCH 25/38] Added a changelog --- Changelog.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Changelog.md diff --git a/Changelog.md b/Changelog.md new file mode 100644 index 0000000..2a63786 --- /dev/null +++ b/Changelog.md @@ -0,0 +1,3 @@ +### v0.5.1 +- UPDATED: Logging functionality from command line +- DEV: Code cleanup and extra tests \ No newline at end of file From 9c9aba55d8a32ba716d83ad7aa1f5816db61fe63 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 21:12:19 +0200 Subject: [PATCH 26/38] Added my signature, confirming validity of current files --- SIGNED.md | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 SIGNED.md diff --git a/SIGNED.md b/SIGNED.md new file mode 100644 index 0000000..79e66cb --- /dev/null +++ b/SIGNED.md @@ -0,0 +1,103 @@ +##### Signed by https://keybase.io/jdekker +``` +-----BEGIN PGP SIGNATURE----- +Version: GnuPG v1.4.11 (GNU/Linux) + +iQIcBAABAgAGBQJTnfAAAAoJEJrQ9RIUCT6/KZIQAME07yzAG5hnqsQof5ESoeQs +5wBxAhiBIX/0yn3qIT/eMh0ubCKUZsqJ3/PzUljeMJ6CGtwxFYfTWkgjYlOoAz9G +fS7CjPmRPyiu+MFo5he+oVRmLUMqfuLUrCyuIxJwMXq5YbQvzyqiffvxr8VRULtV +3c0drWfQMX1ZeAWSIYN0xuMndzvaqIAQU6o4tSQf/rUiKlM2NnTDNUHu2PY9FED/ +IJwM/IgAMAkJARyL7ltq6pHzORsu7sd2Nhv0esa0Gs2GSuRjKueeMZvJzpDAufy9 +bWn9EqKhVwPR6zWnXRmNj9Ymj1w167hIUYcBdFhC7kie5zv9+pDE6d/s7pw/Rejd +L0k8LKBGtJ8o7SKYR9kcNLDWXEnHjfCraD+14FMYqQPcz2ekoV6Exv/mP8qRPwUc +b+FtjJtW8fEiOMAyjMOvLTzYbCVwjdErAqgNdHeSByi1nxfrphjajRiNUt7fVimJ +++QZzKCj6xN2MuTJ41KbZ8teiUXwQB4OKKij0fgoy0RBwW0vqH6MF7cCKm1zT1Qa +9FGlBU2jSybQqUu4lJ/eUjO/3tQMhJErQJU/i+6lwi7OMnS9J/g17Heghp5Hxyhc +VWvhR56pbWLIL2XQqDGGEqPDIzXohHnbRJ1N71b06akIvIIrTqc6Glu4PJeUG/Pe +EF8/jBwydxbKUOyKRSQS +=xWbc +-----END PGP SIGNATURE----- + +``` + + + +### Begin signed statement + +#### Expect + +``` +size exec file contents + ./ +17591 .coverage 1dd1207846db74e407d3a4a1951b8e81934a4693385d39f6c337a224375bad39|1b7ead09cf213b5a9545557be982aaa30238b689bb54adf604f82b12ef521eb2 +375 .gitignore d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1 +464 .travis.yml 3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c +97 Changelog.md bcbce9a33bbbbcd18fd7788e6dc3a9c4b13dff7128ea99968994c1b290ddc931 + FourmiCrawler/ +0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806 +2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49 +716 settings.py 37a8f63e123bccc77076d574617a522b30c1d7c5e893ec3d78cc40e1563dd8a6 + sources/ +9991 ChemSpider.py 847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d +9898 NIST.py 97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644 +6907 WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97 +0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +1262 source.py 16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4 +3026 spider.py 1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a +1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c +3965 README.md d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3 +3659 x fourmi.py 81781ed7299e447e6fc551fba69e62cd7a1d63f27dfa063927f4c5c10f5ac331 +200850 log.txt d76e741f9e7b67c2574e9cdbbe499ea4861f6e0bd11e5962fdaf9d8720effef8 +184692 results.csv 
31132f7f394babeb5dfd249aaa714756017b2e1b314b6715f57e6ad9524e5be8|d0bb724f6d714ec7a4a1ad2052f70dd4510b5ac08d616e24b5e9a903dedab586 +261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85 + tests/ +1 __init__.py 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b +2837 test_configurator.py 4a0eb6e7121eb09a63ab5cb797570d1a42080c5346c3b8b365da56eefa599e80 +1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031 +1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869 +2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299 + utils/ +0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +3552 configurator.py e2b7e0ee6c1fef4373785dfe5df8ec6950f31ce6a5d9632b69a66ea3d1eaf921 +2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37 +``` + +#### Ignore + +``` +/SIGNED.md +``` + +#### Presets + +``` +git # ignore .git and anything as described by .gitignore files +dropbox # ignore .dropbox-cache and other Dropbox-related files +kb # ignore anything as described by .kbignore files +``` + + + +### End signed statement + +
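
For reference, the workflow that produces and checks a directory signature like the one above, assuming the legacy keybase node CLI that generated this file (current keybase releases may differ):

```bash
keybase dir sign     # hash the tracked files and write/refresh SIGNED.md
keybase dir verify   # recompute the hashes and compare them to the signed manifest
```
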
+ +#### Notes + +With keybase you can sign any directory's contents, whether it's a git repo, +source code distribution, or a personal documents folder. It aims to replace the drudgery of: + + 1. comparing a zipped file to a detached statement + 2. downloading a public key + 3. confirming it is in fact the author's by reviewing public statements they've made, using it + +All in one simple command: + +```bash +keybase dir verify +``` + +There are lots of options, including assertions for automating your checks. + +For more info, check out https://keybase.io/docs/command_line/code_signing \ No newline at end of file From c2e78298514e3877f63fb8799f8890c764647746 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 21:19:32 +0200 Subject: [PATCH 27/38] Bumped version number --- fourmi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fourmi.py b/fourmi.py index 55a3c20..15c3c0d 100755 --- a/fourmi.py +++ b/fourmi.py @@ -69,7 +69,7 @@ def search(docopt_arguments, source_loader): # The start for the Fourmi Command Line interface. if __name__ == '__main__': - arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.1') + arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.2') loader = SourceLoader() if arguments["--include"]: From 449ac71f236d25df5f4e12b0fd50f66f8e315598 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 21:20:39 +0200 Subject: [PATCH 28/38] Added changes to the changelog --- Changelog.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Changelog.md b/Changelog.md index 2a63786..577c47c 100644 --- a/Changelog.md +++ b/Changelog.md @@ -1,3 +1,7 @@ +### v0.5.2 +- FIX: Signatured used to contain untracked and older files, current signature +should be correct. + ### v0.5.1 - UPDATED: Logging functionality from command line - DEV: Code cleanup and extra tests \ No newline at end of file From cb9fe9e1027cedda5a371e3496faa3c8adaefc1e Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 15 Jun 2014 21:21:10 +0200 Subject: [PATCH 29/38] The new signature --- SIGNED.md | 91 +++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 47 deletions(-) diff --git a/SIGNED.md b/SIGNED.md index 79e66cb..35d0887 100644 --- a/SIGNED.md +++ b/SIGNED.md @@ -3,19 +3,19 @@ -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.11 (GNU/Linux) -iQIcBAABAgAGBQJTnfAAAAoJEJrQ9RIUCT6/KZIQAME07yzAG5hnqsQof5ESoeQs -5wBxAhiBIX/0yn3qIT/eMh0ubCKUZsqJ3/PzUljeMJ6CGtwxFYfTWkgjYlOoAz9G -fS7CjPmRPyiu+MFo5he+oVRmLUMqfuLUrCyuIxJwMXq5YbQvzyqiffvxr8VRULtV -3c0drWfQMX1ZeAWSIYN0xuMndzvaqIAQU6o4tSQf/rUiKlM2NnTDNUHu2PY9FED/ -IJwM/IgAMAkJARyL7ltq6pHzORsu7sd2Nhv0esa0Gs2GSuRjKueeMZvJzpDAufy9 -bWn9EqKhVwPR6zWnXRmNj9Ymj1w167hIUYcBdFhC7kie5zv9+pDE6d/s7pw/Rejd -L0k8LKBGtJ8o7SKYR9kcNLDWXEnHjfCraD+14FMYqQPcz2ekoV6Exv/mP8qRPwUc -b+FtjJtW8fEiOMAyjMOvLTzYbCVwjdErAqgNdHeSByi1nxfrphjajRiNUt7fVimJ -++QZzKCj6xN2MuTJ41KbZ8teiUXwQB4OKKij0fgoy0RBwW0vqH6MF7cCKm1zT1Qa -9FGlBU2jSybQqUu4lJ/eUjO/3tQMhJErQJU/i+6lwi7OMnS9J/g17Heghp5Hxyhc -VWvhR56pbWLIL2XQqDGGEqPDIzXohHnbRJ1N71b06akIvIIrTqc6Glu4PJeUG/Pe -EF8/jBwydxbKUOyKRSQS -=xWbc +iQIcBAABAgAGBQJTnfIhAAoJEJrQ9RIUCT6/SbIQANKLzmkxwH11vM84kkRbmgHE +d3jLYYNEDQArCTOObYxvyrvE0BK2fhzbdBfccO9rLqu19FnBhcN3WLbkb/WM+2af +G8GkC7yFsWPs1lkrBbouvObPmqwVChGhRETd7xNU6D1NRGKLDT9lXv1FkjU2qt6P +CQwF129aTRzCZ9XGoVKG9wnKuaPm2EYkYHKlG3eck+eeKklTlmJcGi5ON7iGsUpE +hNVrSg8WwN4SzpOEgXlyBn9Zzci81XeZqy3Fnp7u1CEq5tOuWITXa1i5wQ9Jq/2n +5HP0XLbY5grW6Cpqh5jDUiX/XnNtCwpPWRnz4lCLswwMIDLCpq5tJubIay7GMvsx +fV1+UUGAR1EcWNWI0R6XJNbb2EHzidDJcLWlVo1InJDxevECq3CNnh7fRC9bixiG +EV0C/Abig/rvyX5cc9ozmwO3e0gzmtwwyywxOWLzJgVns3jfuA9MhaGDczIC1kuR +Tig9ciByErhT6v8SjgS3gyhWc+tRSx5R3M1Y78CungW3c61VA3Jo/fWHY6Db0JwH +9lVnGU4Ql4mbQQQAv7e/6r6ZhYwoBsAkOKdqT4Dn8aLaItZ8+oB2FXEl/P6V55hN +ambDSt476mwJcyDyIIwxTLyqcop2zYBdaUATe8lwo+0OoXuCLfjnThkHzy2dA0CP +xqHuzkM3Pdb6qOU3cUK7 +=PVt+ -----END PGP SIGNATURE----- ``` @@ -27,40 +27,37 @@ EF8/jBwydxbKUOyKRSQS #### Expect ``` -size exec file contents - ./ -17591 .coverage 1dd1207846db74e407d3a4a1951b8e81934a4693385d39f6c337a224375bad39|1b7ead09cf213b5a9545557be982aaa30238b689bb54adf604f82b12ef521eb2 -375 .gitignore d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1 -464 .travis.yml 3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c -97 Changelog.md bcbce9a33bbbbcd18fd7788e6dc3a9c4b13dff7128ea99968994c1b290ddc931 - FourmiCrawler/ -0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 -304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806 -2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49 -716 settings.py 37a8f63e123bccc77076d574617a522b30c1d7c5e893ec3d78cc40e1563dd8a6 - sources/ -9991 ChemSpider.py 847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d -9898 NIST.py 97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644 -6907 WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97 -0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 -1262 source.py 16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4 -3026 spider.py 1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a -1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c -3965 README.md d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3 -3659 x fourmi.py 81781ed7299e447e6fc551fba69e62cd7a1d63f27dfa063927f4c5c10f5ac331 -200850 log.txt 
d76e741f9e7b67c2574e9cdbbe499ea4861f6e0bd11e5962fdaf9d8720effef8 -184692 results.csv 31132f7f394babeb5dfd249aaa714756017b2e1b314b6715f57e6ad9524e5be8|d0bb724f6d714ec7a4a1ad2052f70dd4510b5ac08d616e24b5e9a903dedab586 -261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85 - tests/ -1 __init__.py 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b -2837 test_configurator.py 4a0eb6e7121eb09a63ab5cb797570d1a42080c5346c3b8b365da56eefa599e80 -1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031 -1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869 -2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299 - utils/ -0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 -3552 configurator.py e2b7e0ee6c1fef4373785dfe5df8ec6950f31ce6a5d9632b69a66ea3d1eaf921 -2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37 +size exec file contents + ./ +375 .gitignore d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1 +464 .travis.yml 3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c +208 Changelog.md 370ecb699890e839e73e22822286b2b2ee7e7ec6c485908e10b8c30e7f9acd47 + FourmiCrawler/ +0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806 +2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49 +716 settings.py 37a8f63e123bccc77076d574617a522b30c1d7c5e893ec3d78cc40e1563dd8a6 + sources/ +9991 ChemSpider.py 847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d +9898 NIST.py 97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644 +6907 WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97 +0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +1262 source.py 16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4 +3026 spider.py 1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a +1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c +3965 README.md d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3 +3659 x fourmi.py 7b4202ecfc8726fcc3f211c459aada7f5610fa4c4c0a7b916e44fc12d71010a1 +261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85 + tests/ +1 __init__.py 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b +2837 test_configurator.py 4a0eb6e7121eb09a63ab5cb797570d1a42080c5346c3b8b365da56eefa599e80 +1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031 +1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869 +2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299 + utils/ +0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +3552 configurator.py e2b7e0ee6c1fef4373785dfe5df8ec6950f31ce6a5d9632b69a66ea3d1eaf921 +2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37 ``` #### Ignore From 3f0b2030579970861c9d4d8194d01ddb490042fa Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Mon, 16 Jun 2014 15:03:59 +0200 Subject: [PATCH 30/38] Possibility of combination of verbose and include/exclude in options --- Changelog.md | 3 +++ fourmi.py | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Changelog.md b/Changelog.md index 577c47c..3ed41dc 100644 --- a/Changelog.md +++ b/Changelog.md @@ -1,3 +1,6 @@ +### v0.5.3 +- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options + ### v0.5.2 - FIX: Signatured used to contain untracked and older files, current signature should be correct. diff --git a/fourmi.py b/fourmi.py index 15c3c0d..319e35d 100755 --- a/fourmi.py +++ b/fourmi.py @@ -5,8 +5,7 @@ Fourmi, a web scraper build to search specific information for a given compound Usage: fourmi search fourmi [options] search - fourmi [-v | -vv | -vvv] [options] search - fourmi [options] [--include= | --exclude=] search + fourmi [options] [-v | -vv | -vvv] [--include= | --exclude=] search fourmi list fourmi [--include= | --exclude=] list fourmi -h | --help From f8766986b0b21b32bf409764b81ff51e692595ee Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Mon, 16 Jun 2014 15:30:05 +0200 Subject: [PATCH 31/38] Disable logging when not verbose --- fourmi.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fourmi.py b/fourmi.py index 319e35d..9408818 100755 --- a/fourmi.py +++ b/fourmi.py @@ -61,7 +61,8 @@ def search(docopt_arguments, source_loader): conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) setup_crawler(docopt_arguments[""], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) - log.start(conf.scrapy_settings.get("LOG_FILE"), + if conf.scrapy_settings.getbool("LOG_ENABLED"): + log.start(conf.scrapy_settings.get("LOG_FILE"), conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) reactor.run() From 966087260212b4e25768f91934dcf6d71c86c745 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Mon, 16 Jun 2014 15:30:05 +0200 Subject: [PATCH 32/38] Disable logging when not verbose --- Changelog.md | 1 + fourmi.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Changelog.md b/Changelog.md index 3ed41dc..99d61fb 100644 --- a/Changelog.md +++ b/Changelog.md @@ -1,5 +1,6 @@ ### v0.5.3 - FIX: It is now again possible to use both verbose and the source inclusion/exclusion options +- FIX: Logging is now "actually" disabled if not using the verbose option. 
From f8766986b0b21b32bf409764b81ff51e692595ee Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Mon, 16 Jun 2014 15:30:05 +0200
Subject: [PATCH 31/38] Disable logging when not verbose

---
 fourmi.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fourmi.py b/fourmi.py
index 319e35d..9408818 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -61,7 +61,8 @@ def search(docopt_arguments, source_loader):
     conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
     setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader,
                   docopt_arguments["--attributes"].split(','))
-    log.start(conf.scrapy_settings.get("LOG_FILE"),
+    if conf.scrapy_settings.getbool("LOG_ENABLED"):
+        log.start(conf.scrapy_settings.get("LOG_FILE"),
               conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
     reactor.run()

From 966087260212b4e25768f91934dcf6d71c86c745 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Mon, 16 Jun 2014 15:30:05 +0200
Subject: [PATCH 32/38] Disable logging when not verbose

---
 Changelog.md | 1 +
 fourmi.py    | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/Changelog.md b/Changelog.md
index 3ed41dc..99d61fb 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -1,5 +1,6 @@
 ### v0.5.3
 - FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
+- FIX: Logging is now "actually" disabled if not using the verbose option.
 
 ### v0.5.2
 - FIX: Signature used to contain untracked and older files, current signature

diff --git a/fourmi.py b/fourmi.py
index 319e35d..9408818 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -61,7 +61,8 @@ def search(docopt_arguments, source_loader):
     conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
     setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader,
                   docopt_arguments["--attributes"].split(','))
-    log.start(conf.scrapy_settings.get("LOG_FILE"),
+    if conf.scrapy_settings.getbool("LOG_ENABLED"):
+        log.start(conf.scrapy_settings.get("LOG_FILE"),
               conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
     reactor.run()

From 4dc557d9e8e7bb5ac529e0201f577e23aeca29cb Mon Sep 17 00:00:00 2001
From: Nout van Deijck
Date: Tue, 17 Jun 2014 00:09:17 +0200
Subject: [PATCH 33/38] Finish plugin (comments, log messages, etc.)

---
 FourmiCrawler/sources/PubChem.py | 33 +++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index ab6a99e..fc8250b 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -9,9 +9,11 @@ import re
 class PubChem(Source):
     """ PubChem scraper for chemical properties
 
-    This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance.
+    This parser parses the part of PubChem pages that gives the chemical and physical properties of a substance,
+    including the sources of the property values.
     """
 
+    # PubChem serves the compound name and the property data on different HTML pages, so different URLs are used
     website = 'https://*.ncbi.nlm.nih.gov/*'
     website_www = 'https://www.ncbi.nlm.nih.gov/*'
     website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
@@ -26,7 +28,11 @@ class PubChem(Source):
         self.cfg = config
 
     def parse(self, response):
-        """ Distributes the above described behaviour """
+        """
+        Distributes the above-described behaviour
+        :param response: The incoming search request
+        :return: The found properties if the response is new, or None if the compound was already searched
+        """
         requests = []
         log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
 
@@ -46,12 +52,19 @@ class PubChem(Source):
         n = re.search(r'cid=(\d+)', response.url)
         if n:
             cid = n.group(1)
-            log.msg('cid: %s' % cid, level=log.DEBUG)
-            requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
+            log.msg('cid: %s' % cid, level=log.DEBUG)
+            # the cid identifies the compound and leads to the separate HTML page
+            # that contains the properties and their values; use it to build the
+            # right URL for that page and scrape it
+            requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
 
         return requests
 
     def parse_data(self, response):
+        """
+        Parse the data found in the 'Chemical and Physical Properties' part of a substance page.
+        :param response: The response with the page to parse
+        :return: A list of Result items with the property names, values, sources, etc.
+        """
         log.msg('parsing data', level=log.DEBUG)
         requests = []
 
@@ -59,8 +72,8 @@ class PubChem(Source):
         sel = Selector(response)
         props = sel.xpath('//div')
 
         for prop in props:
-            prop_name = ''.join(prop.xpath('b/text()').extract())
-            if prop.xpath('a'):
+            prop_name = ''.join(prop.xpath('b/text()').extract())  # the name of the property being parsed
+            if prop.xpath('a'):  # the property has a single value
                 prop_source = ''.join(prop.xpath('a/@title').extract())
                 prop_value = ''.join(prop.xpath('a/text()').extract())
                 new_prop = Result({
@@ -70,8 +83,11 @@ class PubChem(Source):
                     'reliability': 'Unknown',
                     'conditions': ''
                 })
+                log.msg('PubChem prop: |%s| |%s| |%s|' %
+                        (new_prop['attribute'], new_prop['value'],
+                         new_prop['source']), level=log.DEBUG)
                 requests.append(new_prop)
-            elif prop.xpath('ul'):
+            elif prop.xpath('ul'):  # the property has multiple values (a list)
                 prop_values = prop.xpath('ul//li')
                 for prop_li in prop_values:
                     prop_value = ''.join(prop_li.xpath('a/text()').extract())
@@ -83,6 +99,9 @@ class PubChem(Source):
                         'reliability': 'Unknown',
                         'conditions': ''
                     })
+                    log.msg('PubChem prop: |%s| |%s| |%s|' %
+                            (new_prop['attribute'], new_prop['value'],
+                             new_prop['source']), level=log.DEBUG)
                     requests.append(new_prop)
 
         return requests
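parse_data distinguishes two layouts on the properties page: a <div> whose direct <a> child holds a single value, and a <div> with a <ul> holding a list of values. A quick sketch of that branching with Scrapy's Selector on hand-written markup; the HTML is an assumed simplification of the real PubChem page, not a verbatim copy:

```python
# Sketch of the single-value vs. list-value branching used by parse_data,
# run against simplified, hand-written markup.
from scrapy.selector import Selector

html = """
<html><body>
<div><b>Boiling Point</b> <a title="DrugBank">100 deg C</a></div>
<div><b>Synonyms</b>
  <ul><li><a title="MeSH">water</a></li>
      <li><a title="MeSH">oxidane</a></li></ul>
</div>
</body></html>
"""

sel = Selector(text=html)
for prop in sel.xpath('//div'):
    prop_name = ''.join(prop.xpath('b/text()').extract())
    if prop.xpath('a'):      # single value: one direct <a> child
        print prop_name, '->', ''.join(prop.xpath('a/text()').extract())
    elif prop.xpath('ul'):   # value list: the values sit in <ul>//<li>/<a>
        for li in prop.xpath('ul//li'):
            print prop_name, '->', ''.join(li.xpath('a/text()').extract())
```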
From 56e1d3cfb6a785b3a2b444a93eeca2fb02b2be88 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Tue, 17 Jun 2014 00:28:01 +0200
Subject: [PATCH 34/38] No config files should be included on GitHub

---
 sources.cfg | 15 ---------------
 1 file changed, 15 deletions(-)
 delete mode 100644 sources.cfg

diff --git a/sources.cfg b/sources.cfg
deleted file mode 100644
index a9fa2fb..0000000
--- a/sources.cfg
+++ /dev/null
@@ -1,15 +0,0 @@
-[DEFAULT]
-reliability = Unknown
-
-[ChemSpider]
-reliability = High
-token = 052bfd06-5ce4-43d6-bf12-89eabefd2338
-
-[NIST]
-reliability = High
-
-[WikipediaParser]
-reliability = Medium
-
-[PubChem]
-reliability = High
\ No newline at end of file
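With sources.cfg no longer tracked, a local copy has to be recreated after cloning. A minimal sketch using the standard library's ConfigParser to reproduce the section layout of the deleted file; the token value is a placeholder to be replaced with a real ChemSpider token:

```python
# Recreate a local sources.cfg with the same sections as the deleted file.
# The token below is a placeholder, not a working ChemSpider token.
import ConfigParser

config = ConfigParser.ConfigParser()
config.set('DEFAULT', 'reliability', 'Unknown')
for section in ['ChemSpider', 'NIST', 'WikipediaParser', 'PubChem']:
    config.add_section(section)
config.set('ChemSpider', 'reliability', 'High')
config.set('ChemSpider', 'token', '<your-chemspider-token>')
config.set('NIST', 'reliability', 'High')
config.set('WikipediaParser', 'reliability', 'Medium')
config.set('PubChem', 'reliability', 'High')

with open('sources.cfg', 'w') as f:
    config.write(f)
```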
From 6e16e9f23e19016ac5a5d3eff3dd4e07cdf9e8c8 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Tue, 17 Jun 2014 00:33:08 +0200
Subject: [PATCH 35/38] TODO on spoofing user agent

---
 FourmiCrawler/settings.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index 320f573..338f224 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -18,10 +18,10 @@ ITEM_PIPELINES = {
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'
 
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
-
-
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent
+# [todo] - Check for repercussions on spoofing the user agent
 
 # USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'

From 25bf003bdbda36095bc5d972820bfb5666c8765c Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Tue, 17 Jun 2014 00:35:50 +0200
Subject: [PATCH 36/38] Added PubChem to changelog

---
 Changelog.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Changelog.md b/Changelog.md
index 99d61fb..b1885f6 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -1,6 +1,7 @@
 ### v0.5.3
 - FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
 - FIX: Logging is now "actually" disabled if not using the verbose option.
+- FEATURE: Added support for PubChem
 
 ### v0.5.2
 - FIX: Signature used to contain untracked and older files, current signature
@@ -8,4 +9,4 @@ should be correct.
 
 ### v0.5.1
 - UPDATED: Logging functionality from command line
-- DEV: Code cleanup and extra tests
\ No newline at end of file
+- DEV: Code cleanup and extra tests

From bb62c335d2872d16d40e04830646adc6df59d20a Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Tue, 17 Jun 2014 00:36:31 +0200
Subject: [PATCH 37/38] Bumped version number

---
 fourmi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fourmi.py b/fourmi.py
index 9408818..86f2808 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -69,7 +69,7 @@ def search(docopt_arguments, source_loader):
 
 # The start for the Fourmi Command Line interface.
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.2')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.3')
     loader = SourceLoader()
 
     if arguments["--include"]:

From 35fe51d9161ba1d9bc2147125c54e0fb701008ea Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Tue, 17 Jun 2014 00:37:34 +0200
Subject: [PATCH 38/38] Signed the new version

---
 SIGNED.md | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/SIGNED.md b/SIGNED.md
index 35d0887..3fc4507 100644
--- a/SIGNED.md
+++ b/SIGNED.md
@@ -3,19 +3,19 @@
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1.4.11 (GNU/Linux)
 
-iQIcBAABAgAGBQJTnfIhAAoJEJrQ9RIUCT6/SbIQANKLzmkxwH11vM84kkRbmgHE
-d3jLYYNEDQArCTOObYxvyrvE0BK2fhzbdBfccO9rLqu19FnBhcN3WLbkb/WM+2af
-G8GkC7yFsWPs1lkrBbouvObPmqwVChGhRETd7xNU6D1NRGKLDT9lXv1FkjU2qt6P
-CQwF129aTRzCZ9XGoVKG9wnKuaPm2EYkYHKlG3eck+eeKklTlmJcGi5ON7iGsUpE
-hNVrSg8WwN4SzpOEgXlyBn9Zzci81XeZqy3Fnp7u1CEq5tOuWITXa1i5wQ9Jq/2n
-5HP0XLbY5grW6Cpqh5jDUiX/XnNtCwpPWRnz4lCLswwMIDLCpq5tJubIay7GMvsx
-fV1+UUGAR1EcWNWI0R6XJNbb2EHzidDJcLWlVo1InJDxevECq3CNnh7fRC9bixiG
-EV0C/Abig/rvyX5cc9ozmwO3e0gzmtwwyywxOWLzJgVns3jfuA9MhaGDczIC1kuR
-Tig9ciByErhT6v8SjgS3gyhWc+tRSx5R3M1Y78CungW3c61VA3Jo/fWHY6Db0JwH
-9lVnGU4Ql4mbQQQAv7e/6r6ZhYwoBsAkOKdqT4Dn8aLaItZ8+oB2FXEl/P6V55hN
-ambDSt476mwJcyDyIIwxTLyqcop2zYBdaUATe8lwo+0OoXuCLfjnThkHzy2dA0CP
-xqHuzkM3Pdb6qOU3cUK7
-=PVt+
+iQIcBAABAgAGBQJTn3GgAAoJEJrQ9RIUCT6/CI4P/RSAQrd6JugGZoQu/gNdW6eB
+MYCybqYGZiieVhUaGOnFNVlp68YpXH+sP/Uc6hXEX30UQEsDmhMeT5NA7ZMS+zJ9
+MNHGQdJq22lGb3+VoVBV4RTMdkQXOXvx6p5biskjIEtM3tfTxP529GvAX2TFUNnt
+gGWk28EDr30M95XwDxwWo+57Xv8VtSb3VSvXEbrdwGYf8EoQo9oPtzYQ0YcdupcC
+ET8bukYVcwpAjoTnPlEy89TiHHohwmimr2ASXeQ64Ks5wfjzcF7NENCAmaAfR+KI
+VLLuGqdWMBx1ewVuAXTCZ0Mga/kBoRUaO0PC13UmL8LhhZY9Z3cwD4UnPU35/RQi
+IbLfQcZHf/gEvyMeiTYCsyWpm+/xxn1+EfHol4/Q9VSXzZgRBX05Ik6tqeCvjdgG
+4PyHBaJTTm/HfMNdg3mr1mbyjTv5UxglEyPv+Y4NdfoVfepkXsXbzvNSyVffZ3Bw
+UaFp7KzIC4Jugdpv63FleiAdDY0+iZ5shH86wD1+HJ0/a87kn5Ao1yESby7J7U+f
+poZQYeMFeuC0T5hY/3iYoyvZ68oH918ESESiucSulp5BvfwuqGL2+xo5uJIwGYXE
+3IDQC7xbA14JHX86IVJlSHAD33iWyiC+5yjw4/bRRVl37KPsLdHiXH3YIRnF5I2I
+ZbM/uDYyJdZbBe4UoCoF
+=AMhi
 -----END PGP SIGNATURE-----
 ```
 
@@ -31,22 +31,23 @@ size exec file contents
  ./
 375 .gitignore d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1
 464 .travis.yml 3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c
-208 Changelog.md 370ecb699890e839e73e22822286b2b2ee7e7ec6c485908e10b8c30e7f9acd47
+428 Changelog.md c7791d1914ddca9ff1549d90468a79787a7feafe94cecd756e3d7cbd4bcbc7df
  FourmiCrawler/
 0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
 304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806
 2178 pipelines.py
f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49 -716 settings.py 37a8f63e123bccc77076d574617a522b30c1d7c5e893ec3d78cc40e1563dd8a6 +914 settings.py 0be2eaf8e83e85ed27754c896421180fc80cb5ce44449aa9f1048e465d1a96f2 sources/ 9991 ChemSpider.py 847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d 9898 NIST.py 97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644 +4754 PubChem.py 58ed4c92519e385f2768cf8034b006b18f8a21632cb1c5a0849b1a329a8c6ffb 6907 WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97 0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 1262 source.py 16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4 3026 spider.py 1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a 1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c 3965 README.md d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3 -3659 x fourmi.py 7b4202ecfc8726fcc3f211c459aada7f5610fa4c4c0a7b916e44fc12d71010a1 +3676 x fourmi.py 2ff89f97fd2a49d08417d9ab6cf08e88944d0c45f54ec84550b530be48676c23 261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85 tests/ 1 __init__.py 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b