diff --git a/Changelog.md b/Changelog.md new file mode 100644 index 0000000..b1885f6 --- /dev/null +++ b/Changelog.md @@ -0,0 +1,12 @@ +### v0.5.3 +- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options +- FIX: Logging is now "actually" disabled if not using the verbose option. +- FEATURE: Added support for PubChem + +### v0.5.2 +- FIX: Signature used to contain untracked and older files; the current signature +should be correct. + +### v0.5.1 +- UPDATED: Logging functionality from command line +- DEV: Code cleanup and extra tests diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py index 8c1df07..338f224 100644 --- a/FourmiCrawler/settings.py +++ b/FourmiCrawler/settings.py @@ -18,8 +18,10 @@ ITEM_PIPELINES = { FEED_URI = 'results.json' FEED_FORMAT = 'jsonlines' - # Crawl responsibly by identifying yourself (and your website) on the # user-agent +# [todo] - Check for repercussions on spoofing the user agent + # USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)' +USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36' diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 87a6ee7..0110e57 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -26,9 +26,8 @@ class ChemSpider(Source): structure = 'Chemical-Structure.%s.html' extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' - def __init__(self, config={}): + def __init__(self, config=None): Source.__init__(self, config) - self.cfg = config self.ignore_list = [] if 'token' not in self.cfg or self.cfg['token'] == '': log.msg('ChemSpider token not set or empty, search/MassSpec API ' @@ -37,7 +36,6 @@ class ChemSpider(Source): self.search += self.cfg['token'] self.extendedinfo += self.cfg['token'] - def parse(self, response): sel = Selector(response) requests = [] @@ -199,13 +197,14 @@ 
class ChemSpider(Source): return properties def newresult(self, attribute, value, conditions='', source='ChemSpider'): - return Result({ + return Result( + { 'attribute': attribute, 'value': value, 'source': source, 'reliability': self.cfg['reliability'], 'conditions': conditions - }) + }) def parse_searchrequest(self, response): """Parse the initial response of the ChemSpider Search API """ diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 3c323ef..934b457 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -22,12 +22,9 @@ class NIST(Source): search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' - cfg = {} - - def __init__(self, config={}): + def __init__(self, config=None): Source.__init__(self, config) self.ignore_list = set() - self.cfg = config def parse(self, response): sel = Selector(response) @@ -88,7 +85,6 @@ class NIST(Source): InChiKey, CAS number """ ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]') - li = ul.xpath('li') raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract() for synonym in raw_synonyms[0].strip().split(';\n'): @@ -255,12 +251,13 @@ class NIST(Source): return results def newresult(self, attribute, value, conditions=''): - return Result({ - 'attribute': attribute, - 'value': value, - 'source': 'NIST', - 'reliability': self.cfg['reliability'], - 'conditions': conditions + return Result( + { + 'attribute': attribute, + 'value': value, + 'source': 'NIST', + 'reliability': self.cfg['reliability'], + 'conditions': conditions }) def new_compound_request(self, compound): diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py new file mode 100644 index 0000000..fc8250b --- /dev/null +++ b/FourmiCrawler/sources/PubChem.py @@ -0,0 +1,111 @@ +from scrapy.http import Request +from scrapy import log +from source import Source +from scrapy.selector import Selector +from FourmiCrawler.items import Result +import re + + +class PubChem(Source): + 
""" PubChem scraper for chemical properties + + This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance, + including sources of the values of properties. + """ + + #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used + website = 'https://*.ncbi.nlm.nih.gov/*' + website_www = 'https://www.ncbi.nlm.nih.gov/*' + website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*' + search = 'pccompound?term=%s' + data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s' + + __spider = None + searched_compounds = set() + + def __init__(self, config): + Source.__init__(self, config) + self.cfg = config + + def parse(self, response): + """ + Distributes the above described behaviour + :param response: The incoming search request + :return Returns the found properties if response is unique or returns none if it's already known + """ + requests = [] + log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG) + + sel = Selector(response) + compound = sel.xpath('//h1/text()').extract()[0] + if compound in self.searched_compounds: + return None + + self.searched_compounds.add(compound)  # add(), not update(): update() on a str inserts every character separately + raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0] + for synonym in raw_synonyms.strip().split(', '): + log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG) + self.searched_compounds.add(synonym) + self._spider.get_synonym_requests(synonym) + log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG) + + n = re.search(r'cid=(\d+)',response.url) + if n: + cid = n.group(1) + log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach + # the separate html page which contains the properties and their values + + #using this cid to get the right url and scrape it + requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data)) + return requests + + def 
parse_data(self, response): + """ + Parse data found in 'Chemical and Physical properties' part of a substance page. + :param response: The response with the page to parse + :return: requests: Returns a list of properties with their values, source, etc. + """ + log.msg('parsing data', level=log.DEBUG) + requests = [] + + sel = Selector(response) + props = sel.xpath('//div') + + for prop in props: + prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing + if prop.xpath('a'): # parsing for single value in property + prop_source = ''.join(prop.xpath('a/@title').extract()) + prop_value = ''.join(prop.xpath('a/text()').extract()) + new_prop = Result({ + 'attribute': prop_name, + 'value': prop_value, + 'source': prop_source, + 'reliability': 'Unknown', + 'conditions': '' + }) + log.msg('PubChem prop: |%s| |%s| |%s|' % + (new_prop['attribute'], new_prop['value'], + new_prop['source']), level=log.DEBUG) + requests.append(new_prop) + elif prop.xpath('ul'): # parsing for multiple values (list) in property + prop_values = prop.xpath('ul//li') + for prop_li in prop_values: + prop_value = ''.join(prop_li.xpath('a/text()').extract()) + prop_source = ''.join(prop_li.xpath('a/@title').extract()) + new_prop = Result({ + 'attribute': prop_name, + 'value': prop_value, + 'source': prop_source, + 'reliability': 'Unknown', + 'conditions': '' + }) + log.msg('PubChem prop: |%s| |%s| |%s|' % + (new_prop['attribute'], new_prop['value'], + new_prop['source']), level=log.DEBUG) + requests.append(new_prop) + + return requests + + + def new_compound_request(self, compound): + return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index 4aa49b2..401698c 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -1,9 +1,11 @@ +import re + from scrapy.http import Request from scrapy import 
log -from source import Source from scrapy.selector import Selector + +from source import Source from FourmiCrawler.items import Result -import re class WikipediaParser(Source): @@ -17,11 +19,8 @@ class WikipediaParser(Source): __spider = None searched_compounds = [] - cfg = {} - - def __init__(self, config={}): + def __init__(self, config=None): Source.__init__(self, config) - self.cfg = config def parse(self, response): """ @@ -53,7 +52,7 @@ class WikipediaParser(Source): # scrape the chembox (wikipedia template) items = self.parse_chembox(sel, items) - #scrape the drugbox (wikipedia template) + # scrape the drugbox (wikipedia template) items = self.parse_drugbox(sel, items) items = filter(lambda a: a['value'] != '', items) # remove items with an empty value @@ -123,7 +122,6 @@ class WikipediaParser(Source): level=log.DEBUG) return items - def new_compound_request(self, compound): return Request(url=self.website[:-1] + compound, callback=self.parse) @@ -161,10 +159,11 @@ class WikipediaParser(Source): return links def newresult(self, attribute, value): - return Result({ - 'attribute': attribute, - 'value': value, - 'source': 'Wikipedia', - 'reliability': self.cfg['reliability'], - 'conditions': '' + return Result( + { + 'attribute': attribute, + 'value': value, + 'source': 'Wikipedia', + 'reliability': self.cfg['reliability'], + 'conditions': '' }) diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py index a609bb9..36218b0 100644 --- a/FourmiCrawler/sources/source.py +++ b/FourmiCrawler/sources/source.py @@ -6,10 +6,13 @@ class Source: website = "http://something/*" # Regex of URI's the source is able to parse _spider = None - def __init__(self, config={}): + def __init__(self, config=None): """ Initiation of a new Source """ + self.cfg = {} + if config is not None: + self.cfg = config pass def parse(self, response): diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 5c09f07..ebfd2cf 100644 --- a/FourmiCrawler/spider.py 
+++ b/FourmiCrawler/spider.py @@ -10,7 +10,7 @@ class FourmiSpider(Spider): """ name = "FourmiSpider" - def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs): + def __init__(self, compound=None, selected_attributes=None, *args, **kwargs): """ Initiation of the Spider :param compound: compound that will be searched. @@ -20,7 +20,10 @@ class FourmiSpider(Spider): self.synonyms = set() super(FourmiSpider, self).__init__(*args, **kwargs) self.synonyms.add(compound) - self.selected_attributes = selected_attributes + if selected_attributes is None: + self.selected_attributes = [".*"] + else: + self.selected_attributes = selected_attributes def parse(self, response): """ diff --git a/README.md b/README.md index 48b0419..f09f77c 100644 --- a/README.md +++ b/README.md @@ -23,21 +23,21 @@ documentation](http://doc.scrapy.org/en/latest/index.html). ### Installing -If you're installing Fourmi, please take a look at our [installation guide](...) -on our wiki. When you've installed the application, make sure to check our -[usage guide](...). +If you're installing Fourmi, please take a look at our installation guides +on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our +usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI). ### Using the Source To use the Fourmi source code multiple dependencies are required. Take a look at -the [wiki page](...) on using the application source code for a step by step +our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code for a step by step installation guide. When developing for the Fourmi project keep in mind that code readability is a must. To maintain the readability, code should be conform with the [PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python code. 
More information about the different structures and principles of the -Fourmi application can be found on our [wiki](...). +Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki). ### To Do @@ -45,13 +45,9 @@ The Fourmi project has the following goals for the nearby future: __Main goals:__ -- Improve our documentation and guides. (Assignee: Dekker) - Build an graphical user interface(GUI) as alternative for the command line interface(CLI). (Assignee: Harmen) - Compiling the source into an windows executable. (Assignee: Bas) -- Create an configuration file to hold logins and API keys. -- Determine reliability of our data point. -- Create an module to gather data from NIST. (Assignee: Rob) - Create an module to gather data from PubChem. (Assignee: Nout) __Side goals:__ diff --git a/SIGNED.md b/SIGNED.md new file mode 100644 index 0000000..3fc4507 --- /dev/null +++ b/SIGNED.md @@ -0,0 +1,101 @@ +##### Signed by https://keybase.io/jdekker +``` +-----BEGIN PGP SIGNATURE----- +Version: GnuPG v1.4.11 (GNU/Linux) + +iQIcBAABAgAGBQJTn3GgAAoJEJrQ9RIUCT6/CI4P/RSAQrd6JugGZoQu/gNdW6eB +MYCybqYGZiieVhUaGOnFNVlp68YpXH+sP/Uc6hXEX30UQEsDmhMeT5NA7ZMS+zJ9 +MNHGQdJq22lGb3+VoVBV4RTMdkQXOXvx6p5biskjIEtM3tfTxP529GvAX2TFUNnt +gGWk28EDr30M95XwDxwWo+57Xv8VtSb3VSvXEbrdwGYf8EoQo9oPtzYQ0YcdupcC +ET8bukYVcwpAjoTnPlEy89TiHHohwmimr2ASXeQ64Ks5wfjzcF7NENCAmaAfR+KI +VLLuGqdWMBx1ewVuAXTCZ0Mga/kBoRUaO0PC13UmL8LhhZY9Z3cwD4UnPU35/RQi +IbLfQcZHf/gEvyMeiTYCsyWpm+/xxn1+EfHol4/Q9VSXzZgRBX05Ik6tqeCvjdgG +4PyHBaJTTm/HfMNdg3mr1mbyjTv5UxglEyPv+Y4NdfoVfepkXsXbzvNSyVffZ3Bw +UaFp7KzIC4Jugdpv63FleiAdDY0+iZ5shH86wD1+HJ0/a87kn5Ao1yESby7J7U+f +poZQYeMFeuC0T5hY/3iYoyvZ68oH918ESESiucSulp5BvfwuqGL2+xo5uJIwGYXE +3IDQC7xbA14JHX86IVJlSHAD33iWyiC+5yjw4/bRRVl37KPsLdHiXH3YIRnF5I2I +ZbM/uDYyJdZbBe4UoCoF +=AMhi +-----END PGP SIGNATURE----- + +``` + + + +### Begin signed statement + +#### Expect + +``` +size exec file contents + ./ +375 .gitignore 
d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1 +464 .travis.yml 3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c +428 Changelog.md c7791d1914ddca9ff1549d90468a79787a7feafe94cecd756e3d7cbd4bcbc7df + FourmiCrawler/ +0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806 +2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49 +914 settings.py 0be2eaf8e83e85ed27754c896421180fc80cb5ce44449aa9f1048e465d1a96f2 + sources/ +9991 ChemSpider.py 847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d +9898 NIST.py 97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644 +4754 PubChem.py 58ed4c92519e385f2768cf8034b006b18f8a21632cb1c5a0849b1a329a8c6ffb +6907 WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97 +0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +1262 source.py 16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4 +3026 spider.py 1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a +1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c +3965 README.md d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3 +3676 x fourmi.py 2ff89f97fd2a49d08417d9ab6cf08e88944d0c45f54ec84550b530be48676c23 +261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85 + tests/ +1 __init__.py 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b +2837 test_configurator.py 4a0eb6e7121eb09a63ab5cb797570d1a42080c5346c3b8b365da56eefa599e80 +1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031 +1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869 +2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299 + utils/ +0 __init__.py 
e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +3552 configurator.py e2b7e0ee6c1fef4373785dfe5df8ec6950f31ce6a5d9632b69a66ea3d1eaf921 +2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37 +``` + +#### Ignore + +``` +/SIGNED.md +``` + +#### Presets + +``` +git # ignore .git and anything as described by .gitignore files +dropbox # ignore .dropbox-cache and other Dropbox-related files +kb # ignore anything as described by .kbignore files +``` + + + +### End signed statement + +