From 9c34267fb126f047dec809706fbf313c19980422 Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 10 Jun 2014 12:49:00 +0200 Subject: [PATCH 01/27] added function comments to NIST --- FourmiCrawler/sources/NIST.py | 74 ++++++++++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 10 deletions(-) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 3c323ef..a090700 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -13,8 +13,8 @@ from FourmiCrawler.items import Result # Result item, but should be included eventually. class NIST(Source): - """NIST Scraper plugin - + """ + NIST Scraper plugin This plugin manages searching for a chemical on the NIST website and parsing the resulting page if the chemical exists on NIST. """ @@ -25,11 +25,22 @@ class NIST(Source): cfg = {} def __init__(self, config={}): + """ + Initialization of NIST scraper + :param config: configuration variables for this scraper, must contain + 'reliability' key. + """ Source.__init__(self, config) self.ignore_list = set() self.cfg = config def parse(self, response): + """ + This function is called when a Response matching the variable + 'website' is available for parsing the Response object. + :param response: The Scrapy Response object to be parsed + :return: a list of Result items and Request objects + """ sel = Selector(response) title = sel.xpath('head/title/text()').extract()[0] @@ -84,8 +95,12 @@ class NIST(Source): return requests def parse_generic_info(self, sel): - """Parses: synonyms, chemical formula, molecular weight, InChI, - InChiKey, CAS number + """ + This function parses: synonyms, chemical formula, molecular weight, + InChI, InChiKey, CAS number + :param sel: A Selector object of the entire page in the original + response + :return: a list of Result items """ ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]') li = ul.xpath('li') @@ -125,8 +140,13 @@ class NIST(Source): return requests def parse_aggregate_data(self, table, symbol_table): - """Parses the table(s) which contain possible links to individual - data points + """ + This function parses the table(s) which contain possible links to + individual data points + :param table: a Selector object of the table to be parsed + :param symbol_table: a dictionary containing translations of raw HTML + tags to human readable names + :return: a list of Result items and Request objects """ results = [] for tr in table.xpath('tr[td]'): @@ -159,7 +179,13 @@ class NIST(Source): return results def parse_transition_data(self, table, summary): - """Parses the table containing properties regarding phase changes""" + """ + This function parses the table containing properties regarding phase + changes + :param table: a Selector object of the table to be parsed + :param summary: the name of the property + :return: a list of Result items + """ results = [] tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract()) @@ -180,10 +206,14 @@ class NIST(Source): return results def parse_generic_data(self, table, summary): - """Parses the common tables of 4 and 5 rows. Assumes they are of the + """ + Parses the common tables of 4 and 5 rows. Assumes they are of the form: Symbol (unit)|Temperature (K)|Method|Reference|Comment Symbol (unit)|Temperature (K)|Reference|Comment + :param table: a Selector object of the table to be parsed + :param summary: the name of the property + :return: a list of Result items """ results = [] @@ -204,7 +234,13 @@ class NIST(Source): return results def parse_antoine_data(self, table, summary): - """Parse table containing parameters for the Antione equation""" + """ + This function parses the table containing parameters for the Antione + equation + :param table: a Selector object of the table to be parsed + :param summary: the name of the property + :return: a list of Result items + """ results = [] for tr in table.xpath('tr[td]'): @@ -219,7 +255,12 @@ class NIST(Source): return results def parse_individual_datapoints(self, response): - """Parses the page linked from aggregate data""" + """ + This function parses the 'individual data points' page linked from + the aggregate data table(s) + :param response: the Scrapy Response object to be parsed + :return: a list of Result items + """ sel = Selector(response) table = sel.xpath('//table[@class="data"]')[0] @@ -255,6 +296,14 @@ class NIST(Source): return results def newresult(self, attribute, value, conditions=''): + """ + This function abstracts from the Result item and provides default + values + :param attribute: the name of the attribute + :param value: the value of the attribute + :param conditions: optional conditions regarding the value + :return: A Result item + """ return Result({ 'attribute': attribute, 'value': value, @@ -264,6 +313,11 @@ class NIST(Source): }) def new_compound_request(self, compound): + """ + This function is called when a new synonym is returned to the spider + to generate new requests + :param compound: the name of the compound to search for + """ if compound not in self.ignore_list: self.ignore_list.update(compound) return Request(url=self.website[:-1] + self.search % compound, From 1b7c0b44a920e767039b926bda9b3ea946ac0a7a Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 10 Jun 2014 13:01:47 +0200 Subject: [PATCH 02/27] moved identifying and distribution of parsing of tables to a new function parse_tables --- FourmiCrawler/sources/NIST.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index a090700..08cb299 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -65,6 +65,21 @@ class NIST(Source): log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name), level=log.DEBUG) + requests.extend(self.parse_tables(sel, symbol_table)) + + return requests + + def parse_tables(self, sel, symbol_table): + """ + This function identifies and distributes parsing of tables to other + functions below. + :param sel: A Selector object of the whole page + :param symbol_table: a dictionary containing translations of raw HTML + tags to human readable names + :return: a list of Result items and Requests + """ + requests = [] + for table in sel.xpath('//table[@class="data"]'): summary = table.xpath('@summary').extract()[0] if summary == 'One dimensional data': From 7879d523ec5b9f33fc24dbe39d48dc5a4c1eb9da Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 10 Jun 2014 13:02:39 +0200 Subject: [PATCH 03/27] removed unneeded class variable cfg --- FourmiCrawler/sources/NIST.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 08cb299..14312fa 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -22,8 +22,6 @@ class NIST(Source): search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' - cfg = {} - def __init__(self, config={}): """ Initialization of NIST scraper From a15a66cc7793ff273174b9ab061baf5231f81ce4 Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 10 Jun 2014 13:13:24 +0200 Subject: [PATCH 04/27] made get_unit function --- FourmiCrawler/sources/NIST.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 14312fa..f32ea70 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -201,11 +201,7 @@ class NIST(Source): """ results = [] - tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract()) - m = re.search(r'\((.*)\)', tr_unit) - unit = '!' - if m: - unit = m.group(1) + unit = self.get_unit(table) for tr in table.xpath('tr[td]'): tds = tr.xpath('td/text()').extract() @@ -230,11 +226,7 @@ class NIST(Source): """ results = [] - tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract()) - m = re.search(r'\((.*)\)', tr_unit) - unit = '!' - if m: - unit = m.group(1) + unit = self.get_unit(table) for tr in table.xpath('tr[td]'): tds = tr.xpath('td/text()').extract() @@ -286,11 +278,7 @@ class NIST(Source): name = m.group(1) condition = m.group(2) - tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract()) - m = re.search(r'\((.*)\)', tr_unit) - unit = '!' - if m: - unit = m.group(1) + unit = self.get_unit(table) for tr in table.xpath('tr[td]'): tds = tr.xpath('td/text()').extract() @@ -308,6 +296,16 @@ class NIST(Source): return results + @staticmethod + def get_unit(table): + tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract()) + m = re.search(r'\((.*)\)', tr_unit) + unit = '!' + if m: + unit = m.group(1) + + return unit + def newresult(self, attribute, value, conditions=''): """ This function abstracts from the Result item and provides default From 5e13af5b1b68ba6b2ee58b81114a1756260a7f01 Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 10 Jun 2014 13:42:49 +0200 Subject: [PATCH 05/27] added function comments to ChemSpider --- FourmiCrawler/sources/ChemSpider.py | 65 +++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 87a6ee7..882c0b6 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -12,8 +12,8 @@ from FourmiCrawler.items import Result # [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not class ChemSpider(Source): - """ChemSpider scraper for synonyms and properties - + """ + ChemSpider scraper for synonyms and properties This parser will manage searching for chemicals through the ChemsSpider API, and parsing the resulting ChemSpider page. The token required for the API should be in a configuration file @@ -27,6 +27,11 @@ class ChemSpider(Source): extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' def __init__(self, config={}): + """ + Initialization of ChemSpider scraper + :param config: a dictionary of settings for this scraper, must contain + 'reliability' key + """ Source.__init__(self, config) self.cfg = config self.ignore_list = [] @@ -37,8 +42,13 @@ class ChemSpider(Source): self.search += self.cfg['token'] self.extendedinfo += self.cfg['token'] - def parse(self, response): + """ + This function is called when a Response matching the variable + 'website' is available for parsing the Response object. + :param response: the Scrapy Response object to be parsed + :return: a list of Result items and Request objects + """ sel = Selector(response) requests = [] requests_synonyms = self.parse_synonyms(sel) @@ -49,7 +59,11 @@ class ChemSpider(Source): return requests def parse_properties(self, sel): - """scrape Experimental Data and Predicted ACD/Labs tabs""" + """ + This function scrapes the Experimental Data and Predicted ACD/Labs tabs + :param sel: a Selector object of the whole page + :return: a list of Result items + """ properties = [] # Predicted - ACD/Labs tab @@ -115,7 +129,11 @@ class ChemSpider(Source): return properties def parse_synonyms(self, sel): - """Scrape list of Names and Identifiers""" + """ + This function scrapes the list of Names and Identifiers + :param sel: a Selector object of the whole page + :return: a list of Requests + """ requests = [] synonyms = [] @@ -147,7 +165,13 @@ class ChemSpider(Source): return requests def new_synonym(self, sel, name, category): - """Scrape for a single synonym at a given HTML tag""" + """ + This function scrapes for a single synonym at a given HTML tag + :param sel: a Selector object of the given HTML tag + :param name: the name of the synonym in the tag + :param category: the name of the category the synonym is labeled as + :return: a dictionary containing data on the synonym + """ self.ignore_list.append(name) language = sel.xpath('span[@class="synonym_language"]/text()') if language: @@ -183,7 +207,12 @@ class ChemSpider(Source): return synonym def parse_extendedinfo(self, response): - """Scrape data from the ChemSpider GetExtendedCompoundInfo API""" + """ + This function scrapes data from the ChemSpider GetExtendedCompoundInfo + API, if a token is present in the configuration settings + :param response: a Response object to be parsed + :return: a list of Result items + """ sel = Selector(response) properties = [] names = sel.xpath('*').xpath('name()').extract() @@ -199,6 +228,15 @@ class ChemSpider(Source): return properties def newresult(self, attribute, value, conditions='', source='ChemSpider'): + """ + This function abstracts from the Result item and provides default + values. + :param attribute: the name of the attribute + :param value: the value of the attribute + :param conditions: optional conditions regarding the value + :param source: the name of the source if it is not ChemSpider + :return: A Result item + """ return Result({ 'attribute': attribute, 'value': value, @@ -208,7 +246,13 @@ class ChemSpider(Source): }) def parse_searchrequest(self, response): - """Parse the initial response of the ChemSpider Search API """ + """ + This function parses the initial response of the ChemSpider Search API + Requires a valid token to function. + :param response: the Response object to be parsed + :return: A Request for the information page and a Request for the + extendedinfo API call + """ sel = Selector(response) log.msg('chemspider parse_searchrequest', level=log.DEBUG) sel.register_namespace('cs', 'http://www.chemspider.com/') @@ -229,6 +273,11 @@ class ChemSpider(Source): callback=self.parse_extendedinfo)] def new_compound_request(self, compound): + """ + This function is called when a new synonym is returned to the spider + to generate new requests + :param compound: the name of the compound to search for + """ if compound in self.ignore_list or self.cfg['token'] == '': return None searchurl = self.website[:-1] + self.search % compound From 41bbe5b1de33bc7c912375c28f39a52243055976 Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 10 Jun 2014 13:57:10 +0200 Subject: [PATCH 06/27] moved scraping of Predicted - ACD/Labs tab to its own function --- FourmiCrawler/sources/ChemSpider.py | 87 +++++++++++++++++------------ 1 file changed, 50 insertions(+), 37 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 882c0b6..ac6d5eb 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -66,43 +66,7 @@ class ChemSpider(Source): """ properties = [] - # Predicted - ACD/Labs tab - td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath( - 'normalize-space(string())') - prop_names = td_list[::2] - prop_values = td_list[1::2] - for (prop_name, prop_value) in zip(prop_names, prop_values): - # [:-1] is to remove the colon at the end, [TODO] - test for colon - prop_name = prop_name.extract().encode('utf-8')[:-1] - prop_value = prop_value.extract().encode('utf-8') - prop_conditions = '' - - # Test for properties without values, with one hardcoded exception - if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'): - continue - - # Match for condition in parentheses - m = re.match(r'(.*) \((.*)\)', prop_name) - if m: - prop_name = m.group(1) - prop_conditions = m.group(2) - - # Match for condition in value seperated by an 'at' - m = re.match(r'(.*) at (.*)', prop_value) - if m: - prop_value = m.group(1) - prop_conditions = m.group(2) - - new_prop = self.newresult( - attribute=prop_name, - value=prop_value, - source='ChemSpider Predicted - ACD/Labs Tab', - conditions=prop_conditions - ) - properties.append(new_prop) - log.msg('CS prop: |%s| |%s| |%s|' % - (new_prop['attribute'], new_prop['value'], new_prop['source']), - level=log.DEBUG) + properties.extend(self.parse_acdlabstab(sel)) # Experimental Data Tab, Physico-chemical properties in particular scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical ' @@ -128,6 +92,55 @@ class ChemSpider(Source): return properties + def parse_acdlabstab(self, sel): + """ + This function scrapes the 'Predicted ACD/Labs tab' under Properties + :param sel: a Selector object of the whole page + :return: a list of Request objects + """ + properties = [] + + td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath( + 'normalize-space(string())') + prop_names = td_list[::2] + prop_values = td_list[1::2] + for (prop_name, prop_value) in zip(prop_names, prop_values): + # [:-1] is to remove the colon at the end, [TODO] - test for colon + prop_name = prop_name.extract().encode('utf-8')[:-1] + prop_value = prop_value.extract().encode('utf-8') + prop_conditions = '' + + # Test for properties without values, with one hardcoded exception + if (not re.match(r'^\d', prop_value) or + (prop_name == 'Polarizability' and prop_value == '10-24cm3')): + continue + + # Match for condition in parentheses + m = re.match(r'(.*) \((.*)\)', prop_name) + if m: + prop_name = m.group(1) + prop_conditions = m.group(2) + + # Match for condition in value seperated by an 'at' + m = re.match(r'(.*) at (.*)', prop_value) + if m: + prop_value = m.group(1) + prop_conditions = m.group(2) + + new_prop = self.newresult( + attribute=prop_name, + value=prop_value, + source='ChemSpider Predicted - ACD/Labs Tab', + conditions=prop_conditions + ) + properties.append(new_prop) + log.msg('CS prop: |%s| |%s| |%s|' % + (new_prop['attribute'], new_prop['value'], + new_prop['source']), + level=log.DEBUG) + + return properties + def parse_synonyms(self, sel): """ This function scrapes the list of Names and Identifiers From b4a724392bb336885a719a4a2516d1daddf88eab Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 10 Jun 2014 14:04:39 +0200 Subject: [PATCH 07/27] moved scraping of experimental data tab to its own function --- FourmiCrawler/sources/ChemSpider.py | 58 +++++++++++++++-------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index ac6d5eb..95a1dee 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -67,28 +67,7 @@ class ChemSpider(Source): properties = [] properties.extend(self.parse_acdlabstab(sel)) - - # Experimental Data Tab, Physico-chemical properties in particular - scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical ' - 'Properties"]//li/table/tr/td') - if not scraped_list: - return properties - # Format is: property name followed by a list of values - property_name = scraped_list.pop(0).xpath( - 'span/text()').extract()[0].rstrip() - for line in scraped_list: - if line.xpath('span/text()'): - property_name = line.xpath('span/text()').extract()[0].rstrip() - else: - new_prop = self.newresult( - attribute=property_name[:-1], - value=line.xpath('text()').extract()[0].rstrip(), - source=line.xpath('strong/text()').extract()[0].rstrip(), - ) - properties.append(new_prop) - log.msg('CS prop: |%s| |%s| |%s|' % - (new_prop['attribute'], new_prop['value'], - new_prop['source']), level=log.DEBUG) + properties.extend(self.parse_experimentaldatatab(sel)) return properties @@ -115,13 +94,11 @@ class ChemSpider(Source): (prop_name == 'Polarizability' and prop_value == '10-24cm3')): continue - # Match for condition in parentheses m = re.match(r'(.*) \((.*)\)', prop_name) if m: prop_name = m.group(1) prop_conditions = m.group(2) - # Match for condition in value seperated by an 'at' m = re.match(r'(.*) at (.*)', prop_value) if m: prop_value = m.group(1) @@ -134,10 +111,35 @@ class ChemSpider(Source): conditions=prop_conditions ) properties.append(new_prop) - log.msg('CS prop: |%s| |%s| |%s|' % - (new_prop['attribute'], new_prop['value'], - new_prop['source']), - level=log.DEBUG) + + return properties + + def parse_experimentaldatatab(self, sel): + """ + This function scrapes Experimental Data tab, Physico-chemical + properties in particular. + :param sel: a Selector object of the whole page + :return: a list of Result items + """ + properties = [] + + scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical ' + 'Properties"]//li/table/tr/td') + if not scraped_list: + return properties + # Format is: property name followed by a list of values + property_name = scraped_list.pop(0).xpath( + 'span/text()').extract()[0].rstrip() + for line in scraped_list: + if line.xpath('span/text()'): + property_name = line.xpath('span/text()').extract()[0].rstrip() + else: + new_prop = self.newresult( + attribute=property_name[:-1], + value=line.xpath('text()').extract()[0].rstrip(), + source=line.xpath('strong/text()').extract()[0].rstrip(), + ) + properties.append(new_prop) return properties From 0fab626a55467ae410ee97496989c7b92f2083c5 Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 10 Jun 2014 14:07:02 +0200 Subject: [PATCH 08/27] moved import and removed TODO --- FourmiCrawler/sources/ChemSpider.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 95a1dee..ff7c790 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -1,5 +1,3 @@ -import re - from scrapy import log from scrapy.http import Request from scrapy.selector import Selector @@ -7,9 +5,9 @@ from scrapy.selector import Selector from source import Source from FourmiCrawler.items import Result +import re # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception. -# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not class ChemSpider(Source): """ From d657f9420fe6b8348312b968af53d83a79b6a42e Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 17 Jun 2014 20:55:28 +0200 Subject: [PATCH 09/27] added absolute path to reading sources.cfg --- utils/configurator.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/utils/configurator.py b/utils/configurator.py index 62987c6..b443529 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -1,7 +1,7 @@ import ConfigParser from scrapy.utils.project import get_project_settings - +import os class Configurator: """ @@ -66,8 +66,11 @@ class Configurator: variables for sources :return a ConfigParser object of sources.cfg """ + current_dir = os.path.dirname(os.path.abspath(__file__)) + config_path = current_dir + '\..\sources.cfg' + # [TODO]: location of sources.cfg should be softcoded eventually config = ConfigParser.ConfigParser() - config.read('sources.cfg') # [TODO]: should be softcoded eventually + config.read(config_path) return config @staticmethod From 3ac2a8c16280db7a4ddb00aa988e13b6274427b8 Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 17 Jun 2014 20:55:44 +0200 Subject: [PATCH 10/27] sample sources.cfg --- sources.cfg.sample | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 sources.cfg.sample diff --git a/sources.cfg.sample b/sources.cfg.sample new file mode 100644 index 0000000..a94053c --- /dev/null +++ b/sources.cfg.sample @@ -0,0 +1,19 @@ +[DEFAULT] +reliability = Unknown + +#For each source listed in FourmiCrawler/sources there should be a section +#named exactly as the filename in here. If not present, the DEFAULT value is +#used for reliability of that source. + +[ChemSpider] +reliability = High +#token=Paste ChemSpider API token here and remove the hashtag + +[NIST] +reliability = High + +[WikipediaParser] +reliability = Medium + +[PubChem] +reliability = High From 02f226a8f8697062062a68c2f84899fc748ea322 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 17 Jun 2014 21:53:37 +0200 Subject: [PATCH 11/27] PubChem was added --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index f09f77c..a661409 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,6 @@ __Main goals:__ - Build an graphical user interface(GUI) as alternative for the command line interface(CLI). (Assignee: Harmen) - Compiling the source into an windows executable. (Assignee: Bas) -- Create an module to gather data from PubChem. (Assignee: Nout) __Side goals:__ From 30c7b5fbf8827f25b715c69c99d6c6818f02ed7d Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 17 Jun 2014 21:54:08 +0200 Subject: [PATCH 12/27] Gammar error --- fourmi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fourmi.py b/fourmi.py index 9408818..c7d2d4d 100755 --- a/fourmi.py +++ b/fourmi.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ -Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms). +Fourmi, a web scraper build to search specific information for a given compound (and its pseudonyms). Usage: fourmi search From d8841b2f286651842a6b3090bfaad7c6350ac6e4 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Wed, 18 Jun 2014 00:10:42 +0200 Subject: [PATCH 13/27] Added the changes so far into the changelog --- Changelog.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Changelog.md b/Changelog.md index b1885f6..8957eee 100644 --- a/Changelog.md +++ b/Changelog.md @@ -1,3 +1,7 @@ +### v0.6.0 +- FIX: Using absolute path for configuration files +- DEV: General Code cleanup in documentation + ### v0.5.3 - FIX: It is now again possible to use both verbose and the source inclusion/exclusion options - FIX: Logging is now "actually" disabled if not using the verbose option. From 2cefcfdb133402f16f38ac9548e69e50e7cc3175 Mon Sep 17 00:00:00 2001 From: RTB Date: Thu, 19 Jun 2014 12:46:09 +0200 Subject: [PATCH 14/27] made parse_searchrequest function to parse search page and modified new_compound_request accordingly --- FourmiCrawler/sources/PubChem.py | 37 +++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index fc8250b..08f8347 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -106,6 +106,41 @@ class PubChem(Source): return requests + def parse_searchrequest(self, response): + """ + This function parses the response to the new_compound_request Request + :param response: the Response object to be parsed + :return: A Request for the compound page or what self.parse returns in + case the search request forwarded to the compound page + """ + + #check if pubchem forwarded straight to compound page + m = re.match(self.website_pubchem, response.url) + if m: + log.msg('PubChem search forwarded to compound page', + level=log.DEBUG) + return self.parse(response) + + sel = Selector(response) + + results = sel.xpath('//div[@class="rsltcont"]') + if results: + url = results[0].xpath('div/p/a[1]/@href') + else: + log.msg('PubChem search found nothing or xpath failed', + level=log.DEBUG) + return None + + if url: + url = 'http:' + ''.join(url[0].extract()) + log.msg('PubChem compound page: %s' % url, level=log.DEBUG) + else: + log.msg('PubChem search found results, but no url in first result', + level=log.DEBUG) + return None + + return Request(url=url, callback=self.parse) def new_compound_request(self, compound): - return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) + return Request(url=self.website_www[:-1] + self.search % compound, + callback=self.parse_searchrequest) From 1fb8450367e2cb8640f0c7c4a3eb069be09330ec Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Thu, 19 Jun 2014 21:05:17 +0200 Subject: [PATCH 15/27] The cool folder seperators! --- utils/configurator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/configurator.py b/utils/configurator.py index b443529..358adc7 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -1,7 +1,7 @@ import ConfigParser +import os from scrapy.utils.project import get_project_settings -import os class Configurator: """ @@ -67,7 +67,7 @@ class Configurator: :return a ConfigParser object of sources.cfg """ current_dir = os.path.dirname(os.path.abspath(__file__)) - config_path = current_dir + '\..\sources.cfg' + config_path = current_dir + '/../sources.cfg' # [TODO]: location of sources.cfg should be softcoded eventually config = ConfigParser.ConfigParser() config.read(config_path) From 576683dcd0376440c04f483b820aeb7762dade27 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Thu, 19 Jun 2014 22:01:35 +0200 Subject: [PATCH 16/27] These regular expressions where all wrong --- FourmiCrawler/sources/ChemSpider.py | 7 ++++--- FourmiCrawler/sources/NIST.py | 4 ++-- FourmiCrawler/sources/PubChem.py | 12 +++++++----- FourmiCrawler/sources/WikipediaParser.py | 4 ++-- FourmiCrawler/sources/source.py | 4 ++-- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 5920b85..23b25fe 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -1,3 +1,5 @@ +import re + from scrapy import log from scrapy.http import Request from scrapy.selector import Selector @@ -5,7 +7,6 @@ from scrapy.selector import Selector from source import Source from FourmiCrawler.items import Result -import re # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception. @@ -18,7 +19,7 @@ class ChemSpider(Source): somewhere. """ - website = 'http://www.chemspider.com/*' + website = 'http://www\.chemspider\.com/.*' search = 'Search.asmx/SimpleSearch?query=%s&token=' structure = 'Chemical-Structure.%s.html' @@ -292,6 +293,6 @@ class ChemSpider(Source): """ if compound in self.ignore_list or self.cfg['token'] == '': return None - searchurl = self.website[:-1] + self.search % compound + searchurl = self.website[:-2] + self.search % compound log.msg('chemspider compound', level=log.DEBUG) return Request(url=searchurl, callback=self.parse_searchrequest) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index c136b80..904df80 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -18,7 +18,7 @@ class NIST(Source): This plugin manages searching for a chemical on the NIST website and parsing the resulting page if the chemical exists on NIST. """ - website = "http://webbook.nist.gov/*" + website = "http://webbook\.nist\.gov/.*" search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' @@ -329,5 +329,5 @@ class NIST(Source): """ if compound not in self.ignore_list: self.ignore_list.update(compound) - return Request(url=self.website[:-1] + self.search % compound, + return Request(url=self.website[:-2] + self.search % compound, callback=self.parse) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 08f8347..521b02d 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -1,9 +1,11 @@ +import re + from scrapy.http import Request from scrapy import log -from source import Source from scrapy.selector import Selector + +from source import Source from FourmiCrawler.items import Result -import re class PubChem(Source): @@ -14,9 +16,9 @@ class PubChem(Source): """ #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used - website = 'https://*.ncbi.nlm.nih.gov/*' - website_www = 'https://www.ncbi.nlm.nih.gov/*' - website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*' + website = 'https://.*\.ncbi\.nlm\.nih\.gov/.*' + website_www = 'https://www.ncbi.nlm.nih.gov/.*' + website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/.*' search = 'pccompound?term=%s' data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s' diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index 401698c..385311c 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -15,7 +15,7 @@ class WikipediaParser(Source): It also returns requests with other external sources which contain information on parsed subject. """ - website = "http://en.wikipedia.org/wiki/*" + website = "http://en\.wikipedia\.org/wiki/.*" __spider = None searched_compounds = [] @@ -123,7 +123,7 @@ class WikipediaParser(Source): return items def new_compound_request(self, compound): - return Request(url=self.website[:-1] + compound, callback=self.parse) + return Request(url=self.website[:-2] + compound, callback=self.parse) @staticmethod def clean_items(items): diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py index 36218b0..3ffb47d 100644 --- a/FourmiCrawler/sources/source.py +++ b/FourmiCrawler/sources/source.py @@ -3,7 +3,7 @@ from scrapy import log class Source: - website = "http://something/*" # Regex of URI's the source is able to parse + website = "http://something/.*" # Regex of URI's the source is able to parse _spider = None def __init__(self, config=None): @@ -30,7 +30,7 @@ class Source: :param compound: A compound name. :return: A new Scrapy Request """ - # return Request(url=self.website[:-1] + compound, callback=self.parse) + # return Request(url=self.website[:-2] + compound, callback=self.parse) pass def set_spider(self, spider): From ef1c3193966e9e64f53e5cb5af8ec17791f37aae Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Thu, 19 Jun 2014 22:05:21 +0200 Subject: [PATCH 17/27] Escape escape characters --- FourmiCrawler/settings.py | 4 ++-- FourmiCrawler/sources/ChemSpider.py | 4 ++-- FourmiCrawler/sources/NIST.py | 4 ++-- FourmiCrawler/sources/PubChem.py | 4 ++-- FourmiCrawler/sources/WikipediaParser.py | 4 ++-- FourmiCrawler/sources/source.py | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py index 338f224..ace60ab 100644 --- a/FourmiCrawler/settings.py +++ b/FourmiCrawler/settings.py @@ -23,5 +23,5 @@ FEED_FORMAT = 'jsonlines' # [todo] - Check for repercussions on spoofing the user agent -# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)' -USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36' +USER_AGENT = 'Fourmi' +# USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36' diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 23b25fe..6ca5382 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -19,7 +19,7 @@ class ChemSpider(Source): somewhere. """ - website = 'http://www\.chemspider\.com/.*' + website = 'http://www\\.chemspider\\.com/.*' search = 'Search.asmx/SimpleSearch?query=%s&token=' structure = 'Chemical-Structure.%s.html' @@ -293,6 +293,6 @@ class ChemSpider(Source): """ if compound in self.ignore_list or self.cfg['token'] == '': return None - searchurl = self.website[:-2] + self.search % compound + searchurl = self.website[:-2].replace("\\", "") + self.search % compound log.msg('chemspider compound', level=log.DEBUG) return Request(url=searchurl, callback=self.parse_searchrequest) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 904df80..4ad93f5 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -18,7 +18,7 @@ class NIST(Source): This plugin manages searching for a chemical on the NIST website and parsing the resulting page if the chemical exists on NIST. """ - website = "http://webbook\.nist\.gov/.*" + website = "http://webbook\\.nist\\.gov/.*" search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' @@ -329,5 +329,5 @@ class NIST(Source): """ if compound not in self.ignore_list: self.ignore_list.update(compound) - return Request(url=self.website[:-2] + self.search % compound, + return Request(url=self.website[:-2].replace("\\", "") + self.search % compound, callback=self.parse) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 521b02d..5947e54 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -16,8 +16,8 @@ class PubChem(Source): """ #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used - website = 'https://.*\.ncbi\.nlm\.nih\.gov/.*' - website_www = 'https://www.ncbi.nlm.nih.gov/.*' + website = 'https://.*\\.ncbi\\.nlm\\.nih\\.gov/.*' + website_www = 'https://www.ncbi.nlm.nih.gov/*' website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/.*' search = 'pccompound?term=%s' data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s' diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index 385311c..e27bb39 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -15,7 +15,7 @@ class WikipediaParser(Source): It also returns requests with other external sources which contain information on parsed subject. """ - website = "http://en\.wikipedia\.org/wiki/.*" + website = "http://en\\.wikipedia\\.org/wiki/.*" __spider = None searched_compounds = [] @@ -123,7 +123,7 @@ class WikipediaParser(Source): return items def new_compound_request(self, compound): - return Request(url=self.website[:-2] + compound, callback=self.parse) + return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse) @staticmethod def clean_items(items): diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py index 3ffb47d..a0d3dcd 100644 --- a/FourmiCrawler/sources/source.py +++ b/FourmiCrawler/sources/source.py @@ -30,7 +30,7 @@ class Source: :param compound: A compound name. :return: A new Scrapy Request """ - # return Request(url=self.website[:-2] + compound, callback=self.parse) + # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse) pass def set_spider(self, spider): From 27529c414f18c3332407288bd01c03c0cea68c24 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Thu, 19 Jun 2014 22:06:55 +0200 Subject: [PATCH 18/27] Fourmi as our USER_AGENT --- FourmiCrawler/settings.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py index ace60ab..e82c8e6 100644 --- a/FourmiCrawler/settings.py +++ b/FourmiCrawler/settings.py @@ -21,7 +21,4 @@ FEED_FORMAT = 'jsonlines' # Crawl responsibly by identifying yourself (and your website) on the # user-agent -# [todo] - Check for repercussions on spoofing the user agent - USER_AGENT = 'Fourmi' -# USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36' From a3e973ecadebb963a645318008f92e949d50dfbf Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Thu, 19 Jun 2014 22:08:45 +0200 Subject: [PATCH 19/27] Added INFO message when no compatible source on response --- FourmiCrawler/spider.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index ebfd2cf..32181ce 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -34,8 +34,9 @@ class FourmiSpider(Spider): """ for source in self._sources: if re.match(source.website, response.url): - log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG) + log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG) return source.parse(response) + log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO) return None def get_synonym_requests(self, compound, force=False): From 093eba8b0469a5223911f3f503db493e775c6992 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Thu, 19 Jun 2014 22:26:16 +0200 Subject: [PATCH 20/27] Other occurences of website REGEX --- FourmiCrawler/sources/ChemSpider.py | 4 ++-- FourmiCrawler/sources/NIST.py | 2 +- FourmiCrawler/sources/PubChem.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 6ca5382..b4bf6f0 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -277,8 +277,8 @@ class ChemSpider(Source): log.msg('ChemSpider found multiple substances, taking first ' 'element', level=log.DEBUG) csid = csids[0] - structure_url = self.website[:-1] + self.structure % csid - extendedinfo_url = self.website[:-1] + self.extendedinfo % csid + structure_url = self.website[:-2].replace("\\", "") + self.structure % csid + extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG) return [Request(url=structure_url, callback=self.parse), diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 4ad93f5..691b062 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -164,7 +164,7 @@ class NIST(Source): extra_data_url = tr.xpath('td[last()][a="Individual data points"]' '/a/@href').extract() if extra_data_url: - request = Request(url=self.website[:-1] + extra_data_url[0], + request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0], callback=self.parse_individual_datapoints) results.append(request) continue diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 5947e54..0768612 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -51,7 +51,7 @@ class PubChem(Source): self._spider.get_synonym_requests(synonym) log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG) - n = re.search(r'cid=(\d+)',response.url) + n = re.search(r'cid=(\d+)', response.url) if n: cid = n.group(1) log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach From d7d2a659b12e351cb246dcb2a49bd4dd43eeb67a Mon Sep 17 00:00:00 2001 From: RTB Date: Thu, 19 Jun 2014 22:34:53 +0200 Subject: [PATCH 21/27] changed https to http in PubChem.py --- FourmiCrawler/sources/PubChem.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 0768612..0bc8b8c 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -16,9 +16,9 @@ class PubChem(Source): """ #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used - website = 'https://.*\\.ncbi\\.nlm\\.nih\\.gov/.*' - website_www = 'https://www.ncbi.nlm.nih.gov/*' - website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/.*' + website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*' + website_www = 'http://www.ncbi.nlm.nih.gov/*' + website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*' search = 'pccompound?term=%s' data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s' From 229091520999cf7215120e45f1b719d94dea34e2 Mon Sep 17 00:00:00 2001 From: RTB Date: Thu, 19 Jun 2014 22:45:01 +0200 Subject: [PATCH 22/27] fixed forgotten self.website usage --- FourmiCrawler/sources/PubChem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 0bc8b8c..15fa3f9 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -58,7 +58,7 @@ class PubChem(Source): # the seperate html page which contains the properties and their values #using this cid to get the right url and scrape it - requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data)) + requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data)) return requests def parse_data(self, response): From 98f63a212a989ce612cc2d0fd04349c2ad9531d0 Mon Sep 17 00:00:00 2001 From: RTB Date: Thu, 19 Jun 2014 22:57:17 +0200 Subject: [PATCH 23/27] modified pubchem.py to use reliability from sources.cfg --- FourmiCrawler/sources/PubChem.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 15fa3f9..4468da5 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -82,7 +82,7 @@ class PubChem(Source): 'attribute': prop_name, 'value': prop_value, 'source': prop_source, - 'reliability': 'Unknown', + 'reliability': self.cfg['reliability'], 'conditions': '' }) log.msg('PubChem prop: |%s| |%s| |%s|' % @@ -98,7 +98,7 @@ class PubChem(Source): 'attribute': prop_name, 'value': prop_value, 'source': prop_source, - 'reliability': 'Unknown', + 'reliability': self.cfg['reliability'], 'conditions': '' }) log.msg('PubChem prop: |%s| |%s| |%s|' % From 6c0b55dab611d86bc0414362fd83ac62ff5abeed Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Fri, 20 Jun 2014 11:14:51 +0200 Subject: [PATCH 24/27] Edited the actual functions --- fourmi.py | 4 ++-- utils/configurator.py | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/fourmi.py b/fourmi.py index e45d605..d6d5fd9 100755 --- a/fourmi.py +++ b/fourmi.py @@ -17,7 +17,7 @@ Options: --version Show version. -v Verbose logging output. (Multiple occurrences increase logging level) --log= Save log to an file. - -o --output= Output file [default: results.*format*] + -o --output= Output file [default: .*format*] -f --format= Output formats (supported: csv, json, jsonlines, xml) [default: csv] --include= Include only sources that match these regular expressions split by a comma. --exclude= Exclude the sources that match these regular expressions split by a comma. @@ -58,7 +58,7 @@ def search(docopt_arguments, source_loader): """ conf = Configurator() conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"]) - conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) + conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments[""]) setup_crawler(docopt_arguments[""], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) if conf.scrapy_settings.getbool("LOG_ENABLED"): diff --git a/utils/configurator.py b/utils/configurator.py index 358adc7..2db7cdb 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -3,6 +3,7 @@ import os from scrapy.utils.project import get_project_settings + class Configurator: """ A helper class in the fourmi class. This class is used to process the settings as set @@ -12,7 +13,7 @@ class Configurator: def __init__(self): self.scrapy_settings = get_project_settings() - def set_output(self, filename, fileformat): + def set_output(self, filename, fileformat, compound): """ This function manipulates the Scrapy output file settings that normally would be set in the settings file. In the Fourmi project these are command line arguments. @@ -20,12 +21,12 @@ class Configurator: :param fileformat: The format in which the output will be. """ - if filename != 'results.*format*': + if filename != '.*format*': self.scrapy_settings.overrides["FEED_URI"] = filename elif fileformat == "jsonlines": - self.scrapy_settings.overrides["FEED_URI"] = "results.json" + self.scrapy_settings.overrides["FEED_URI"] = compound + ".json" elif fileformat is not None: - self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat + self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat if fileformat is not None: self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat From 87275a6dc8b1736b3518969368fc942a796a4df4 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Fri, 20 Jun 2014 11:16:56 +0200 Subject: [PATCH 25/27] Edited the tests --- tests/test_configurator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_configurator.py b/tests/test_configurator.py index df29da9..0eb593d 100644 --- a/tests/test_configurator.py +++ b/tests/test_configurator.py @@ -10,16 +10,16 @@ class TestConfigurator(unittest.TestCase): self.conf = Configurator() def test_set_output(self): - self.conf.set_output(filename="test.txt", fileformat="csv") + self.conf.set_output(filename="test.txt", fileformat="csv", compound="test") self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt") self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") - self.conf.set_output("results.*format*", "jsonlines") - self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json") + self.conf.set_output(".*format*", "jsonlines", "test") + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json") self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines") - self.conf.set_output("results.*format*", "csv") - self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv") + self.conf.set_output("results.*format*", "csv", "test") + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv") self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") def test_start_log(self): From 22ca4afa33058781330fd125b61e23281dcb0c4d Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Fri, 20 Jun 2014 11:21:26 +0200 Subject: [PATCH 26/27] Code inspection --- FourmiCrawler/sources/ChemSpider.py | 18 +++++++++--------- FourmiCrawler/sources/NIST.py | 13 +++++++------ FourmiCrawler/sources/PubChem.py | 26 ++++++++++++++------------ fourmi.py | 2 +- 4 files changed, 31 insertions(+), 28 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index b4bf6f0..e95d067 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -89,7 +89,7 @@ class ChemSpider(Source): # Test for properties without values, with one hardcoded exception if (not re.match(r'^\d', prop_value) or - (prop_name == 'Polarizability' and prop_value == '10-24cm3')): + (prop_name == 'Polarizability' and prop_value == '10-24cm3')): continue m = re.match(r'(.*) \((.*)\)', prop_name) @@ -122,12 +122,12 @@ class ChemSpider(Source): properties = [] scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical ' - 'Properties"]//li/table/tr/td') + 'Properties"]//li/table/tr/td') if not scraped_list: return properties # Format is: property name followed by a list of values property_name = scraped_list.pop(0).xpath( - 'span/text()').extract()[0].rstrip() + 'span/text()').extract()[0].rstrip() for line in scraped_list: if line.xpath('span/text()'): property_name = line.xpath('span/text()').extract()[0].rstrip() @@ -251,12 +251,12 @@ class ChemSpider(Source): :return: A Result item """ return Result({ - 'attribute': attribute, - 'value': value, - 'source': source, - 'reliability': self.cfg['reliability'], - 'conditions': conditions - }) + 'attribute': attribute, + 'value': value, + 'source': source, + 'reliability': self.cfg['reliability'], + 'conditions': conditions + }) def parse_searchrequest(self, response): """ diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 691b062..52f1332 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -313,12 +313,13 @@ class NIST(Source): :param conditions: optional conditions regarding the value :return: A Result item """ - return Result({ - 'attribute': attribute, - 'value': value, - 'source': 'NIST', - 'reliability': self.cfg['reliability'], - 'conditions': conditions + return Result( + { + 'attribute': attribute, + 'value': value, + 'source': 'NIST', + 'reliability': self.cfg['reliability'], + 'conditions': conditions }) def new_compound_request(self, compound): diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 15fa3f9..4cd5304 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -15,7 +15,7 @@ class PubChem(Source): including sources of the values of properties. """ - #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used + # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*' website_www = 'http://www.ncbi.nlm.nih.gov/*' website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*' @@ -54,14 +54,16 @@ class PubChem(Source): n = re.search(r'cid=(\d+)', response.url) if n: cid = n.group(1) - log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach - # the seperate html page which contains the properties and their values + log.msg('cid: %s' % cid, level=log.DEBUG) # getting the right id of the compound with which it can reach + # the seperate html page which contains the properties and their values - #using this cid to get the right url and scrape it - requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data)) + # using this cid to get the right url and scrape it + requests.append( + Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data)) return requests - def parse_data(self, response): + @staticmethod + def parse_data(response): """ Parse data found in 'Chemical and Physical properties' part of a substance page. :param response: The response with the page to parse @@ -74,8 +76,8 @@ class PubChem(Source): props = sel.xpath('//div') for prop in props: - prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing - if prop.xpath('a'): # parsing for single value in property + prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing + if prop.xpath('a'): # parsing for single value in property prop_source = ''.join(prop.xpath('a/@title').extract()) prop_value = ''.join(prop.xpath('a/text()').extract()) new_prop = Result({ @@ -89,7 +91,7 @@ class PubChem(Source): (new_prop['attribute'], new_prop['value'], new_prop['source']), level=log.DEBUG) requests.append(new_prop) - elif prop.xpath('ul'): # parsing for multiple values (list) in property + elif prop.xpath('ul'): # parsing for multiple values (list) in property prop_values = prop.xpath('ul//li') for prop_li in prop_values: prop_value = ''.join(prop_li.xpath('a/text()').extract()) @@ -102,8 +104,8 @@ class PubChem(Source): 'conditions': '' }) log.msg('PubChem prop: |%s| |%s| |%s|' % - (new_prop['attribute'], new_prop['value'], - new_prop['source']), level=log.DEBUG) + (new_prop['attribute'], new_prop['value'], + new_prop['source']), level=log.DEBUG) requests.append(new_prop) return requests @@ -116,7 +118,7 @@ class PubChem(Source): case the search request forwarded to the compound page """ - #check if pubchem forwarded straight to compound page + # check if pubchem forwarded straight to compound page m = re.match(self.website_pubchem, response.url) if m: log.msg('PubChem search forwarded to compound page', diff --git a/fourmi.py b/fourmi.py index d6d5fd9..f0caa05 100755 --- a/fourmi.py +++ b/fourmi.py @@ -63,7 +63,7 @@ def search(docopt_arguments, source_loader): source_loader, docopt_arguments["--attributes"].split(',')) if conf.scrapy_settings.getbool("LOG_ENABLED"): log.start(conf.scrapy_settings.get("LOG_FILE"), - conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) + conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) reactor.run() From 11422e555e1a94c27b63cea3e935ce176ae25e26 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Fri, 20 Jun 2014 11:22:55 +0200 Subject: [PATCH 27/27] Missed one test statement --- tests/test_configurator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_configurator.py b/tests/test_configurator.py index 0eb593d..2da9f83 100644 --- a/tests/test_configurator.py +++ b/tests/test_configurator.py @@ -18,7 +18,7 @@ class TestConfigurator(unittest.TestCase): self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json") self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines") - self.conf.set_output("results.*format*", "csv", "test") + self.conf.set_output(".*format*", "csv", "test") self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv") self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")