
Merge branch 'develop' into feature/GUI

Harmen Prins 2014-06-17 17:14:36 +02:00
commit 738e1afb36
16 changed files with 331 additions and 76 deletions

Changelog.md (new file, 12 lines)

@ -0,0 +1,12 @@
### v0.5.3
- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
- FIX: Logging is now "actually" disabled if not using the verbose option.
- FEATURE: Added support for PubChem
### v0.5.2
- FIX: Signature used to contain untracked and older files; the current signature should be correct.
### v0.5.1
- UPDATED: Logging functionality from command line
- DEV: Code cleanup and extra tests

FourmiCrawler/settings.py

@ -18,8 +18,10 @@ ITEM_PIPELINES = {
FEED_URI = 'results.json'
FEED_FORMAT = 'jsonlines'
# Crawl responsibly by identifying yourself (and your website) on the
# user-agent
# [todo] - Check for repercussions on spoofing the user agent
# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'

FourmiCrawler/sources/ChemSpider.py

@ -26,9 +26,8 @@ class ChemSpider(Source):
structure = 'Chemical-Structure.%s.html'
extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
def __init__(self, config={}):
def __init__(self, config=None):
Source.__init__(self, config)
self.cfg = config
self.ignore_list = []
if 'token' not in self.cfg or self.cfg['token'] == '':
log.msg('ChemSpider token not set or empty, search/MassSpec API '
@ -37,7 +36,6 @@ class ChemSpider(Source):
self.search += self.cfg['token']
self.extendedinfo += self.cfg['token']
def parse(self, response):
sel = Selector(response)
requests = []
@ -199,7 +197,8 @@ class ChemSpider(Source):
return properties
def newresult(self, attribute, value, conditions='', source='ChemSpider'):
return Result({
return Result(
{
'attribute': attribute,
'value': value,
'source': source,

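Both this source and the ones below wrap every scraped property in a `Result` item. `Result` comes from `FourmiCrawler/items.py`, which this commit does not touch; a minimal sketch of what that item presumably looks like, assuming Scrapy's declarative `Item`/`Field` API and the field names used in the `newresult` helpers:

```python
# Hypothetical sketch of FourmiCrawler/items.py -- not part of this diff.
# The field names are taken from the Result(...) calls in the sources.
from scrapy.item import Item, Field


class Result(Item):
    attribute = Field()    # name of the scraped property
    value = Field()        # scraped value, possibly including units
    source = Field()       # data source, e.g. 'ChemSpider' or 'NIST'
    reliability = Field()  # reliability label configured per source
    conditions = Field()   # measurement conditions, empty when unknown
```

Declared this way, `Result({'attribute': ..., 'value': ..., ...})` behaves like a dict that only accepts these keys, which is what the reformatted `newresult` calls rely on.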
FourmiCrawler/sources/NIST.py

@ -22,12 +22,9 @@ class NIST(Source):
search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
cfg = {}
def __init__(self, config={}):
def __init__(self, config=None):
Source.__init__(self, config)
self.ignore_list = set()
self.cfg = config
def parse(self, response):
sel = Selector(response)
@ -88,7 +85,6 @@ class NIST(Source):
InChiKey, CAS number
"""
ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
li = ul.xpath('li')
raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
for synonym in raw_synonyms[0].strip().split(';\n'):
@ -255,7 +251,8 @@ class NIST(Source):
return results
def newresult(self, attribute, value, conditions=''):
return Result({
return Result(
{
'attribute': attribute,
'value': value,
'source': 'NIST',

FourmiCrawler/sources/PubChem.py (new file)

@ -0,0 +1,111 @@
from scrapy.http import Request
from scrapy import log
from source import Source
from scrapy.selector import Selector
from FourmiCrawler.items import Result
import re
class PubChem(Source):
""" PubChem scraper for chemical properties
This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance,
including sources of the values of properties.
"""
#PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
website = 'https://*.ncbi.nlm.nih.gov/*'
website_www = 'https://www.ncbi.nlm.nih.gov/*'
website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
search = 'pccompound?term=%s'
data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
__spider = None
searched_compounds = set()
def __init__(self, config):
Source.__init__(self, config)
self.cfg = config
def parse(self, response):
"""
Distributes the above described behaviour
:param response: The incoming search request
:return Returns the found properties if response is unique or returns none if it's already known
"""
requests = []
log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
sel = Selector(response)
compound = sel.xpath('//h1/text()').extract()[0]
if compound in self.searched_compounds:
return None
self.searched_compounds.update(compound)
raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
for synonym in raw_synonyms.strip().split(', '):
log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
self.searched_compounds.update(synonym)
self._spider.get_synonym_requests(synonym)
log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
n = re.search(r'cid=(\d+)',response.url)
if n:
cid = n.group(1)
log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach
# the seperate html page which contains the properties and their values
#using this cid to get the right url and scrape it
requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
return requests
def parse_data(self, response):
"""
Parse data found in 'Chemical and Physical properties' part of a substance page.
:param response: The response with the page to parse
:return: requests: Returns a list of properties with their values, source, etc.
"""
log.msg('parsing data', level=log.DEBUG)
requests = []
sel = Selector(response)
props = sel.xpath('//div')
for prop in props:
prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing
if prop.xpath('a'): # parsing for single value in property
prop_source = ''.join(prop.xpath('a/@title').extract())
prop_value = ''.join(prop.xpath('a/text()').extract())
new_prop = Result({
'attribute': prop_name,
'value': prop_value,
'source': prop_source,
'reliability': 'Unknown',
'conditions': ''
})
log.msg('PubChem prop: |%s| |%s| |%s|' %
(new_prop['attribute'], new_prop['value'],
new_prop['source']), level=log.DEBUG)
requests.append(new_prop)
elif prop.xpath('ul'): # parsing for multiple values (list) in property
prop_values = prop.xpath('ul//li')
for prop_li in prop_values:
prop_value = ''.join(prop_li.xpath('a/text()').extract())
prop_source = ''.join(prop_li.xpath('a/@title').extract())
new_prop = Result({
'attribute': prop_name,
'value': prop_value,
'source': prop_source,
'reliability': 'Unknown',
'conditions': ''
})
log.msg('PubChem prop: |%s| |%s| |%s|' %
(new_prop['attribute'], new_prop['value'],
new_prop['source']), level=log.DEBUG)
requests.append(new_prop)
return requests
def new_compound_request(self, compound):
return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)

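`parse` extracts the numeric compound id (cid) from the search-result URL and then builds the properties page URL by stripping the trailing `*` wildcard from `website_pubchem`. A standalone sketch of that URL handling (the example URL and cid are illustrative only):

```python
import re

website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'

# Illustrative search-result URL; the real one comes from response.url
url = 'https://www.ncbi.nlm.nih.gov/pccompound?term=aspirin&cid=2244'

n = re.search(r'cid=(\d+)', url)
if n:
    cid = n.group(1)  # '2244'
    # [:-1] drops the trailing '*' wildcard before appending the data path
    print(website_pubchem[:-1] + data_url % cid)
    # https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=2244
```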
FourmiCrawler/sources/WikipediaParser.py

@ -1,9 +1,11 @@
import re
from scrapy.http import Request
from scrapy import log
from source import Source
from scrapy.selector import Selector
from source import Source
from FourmiCrawler.items import Result
import re
class WikipediaParser(Source):
@ -17,11 +19,8 @@ class WikipediaParser(Source):
__spider = None
searched_compounds = []
cfg = {}
def __init__(self, config={}):
def __init__(self, config=None):
Source.__init__(self, config)
self.cfg = config
def parse(self, response):
"""
@ -53,7 +52,7 @@ class WikipediaParser(Source):
# scrape the chembox (wikipedia template)
items = self.parse_chembox(sel, items)
#scrape the drugbox (wikipedia template)
# scrape the drugbox (wikipedia template)
items = self.parse_drugbox(sel, items)
items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
@ -123,7 +122,6 @@ class WikipediaParser(Source):
level=log.DEBUG)
return items
def new_compound_request(self, compound):
return Request(url=self.website[:-1] + compound, callback=self.parse)
@ -161,7 +159,8 @@ class WikipediaParser(Source):
return links
def newresult(self, attribute, value):
return Result({
return Result(
{
'attribute': attribute,
'value': value,
'source': 'Wikipedia',

FourmiCrawler/sources/source.py

@ -6,10 +6,13 @@ class Source:
website = "http://something/*" # Regex of URI's the source is able to parse
_spider = None
def __init__(self, config={}):
def __init__(self, config=None):
"""
Initiation of a new Source
"""
self.cfg = {}
if config is not None:
self.cfg = config
pass
def parse(self, response):

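The constructors in this commit consistently move from `config={}` to `config=None`. The motivation is Python's mutable-default-argument behaviour: a default dict is created once, when the function is defined, and then shared by every call that omits the argument. A minimal sketch of the pitfall and of the None-default idiom that `Source.__init__` now uses:

```python
def with_dict_default(config={}):       # the old pattern: one shared dict
    config.setdefault('token', '')
    return config


def with_none_default(config=None):     # the pattern adopted in this commit
    cfg = {} if config is None else config
    return cfg


assert with_dict_default() is with_dict_default()      # same object every call
assert with_none_default() is not with_none_default()  # fresh dict per call
```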
FourmiCrawler/spider.py

@ -10,7 +10,7 @@ class FourmiSpider(Spider):
"""
name = "FourmiSpider"
def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
"""
Initiation of the Spider
:param compound: compound that will be searched.
@ -20,6 +20,9 @@ class FourmiSpider(Spider):
self.synonyms = set()
super(FourmiSpider, self).__init__(*args, **kwargs)
self.synonyms.add(compound)
if selected_attributes is None:
self.selected_attributes = [".*"]
else:
self.selected_attributes = selected_attributes
def parse(self, response):

README.md

@ -23,21 +23,21 @@ documentation](http://doc.scrapy.org/en/latest/index.html).
### Installing
If you're installing Fourmi, please take a look at our [installation guide](...)
on our wiki. When you've installed the application, make sure to check our
[usage guide](...).
If you're installing Fourmi, please take a look at our installation guides
on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our
usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI).
### Using the Source
To use the Fourmi source code multiple dependencies are required. Take a look at
the [wiki page](...) on using the application source code for a step by step
our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code for a step-by-step
installation guide.
When developing for the Fourmi project keep in mind that code readability is a
must. To maintain the readability, code should be conform with the
[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
code. More information about the different structures and principles of the
Fourmi application can be found on our [wiki](...).
Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki).
### To Do
@ -45,13 +45,9 @@ The Fourmi project has the following goals for the nearby future:
__Main goals:__
- Improve our documentation and guides. (Assignee: Dekker)
- Build an graphical user interface(GUI) as alternative for the command line
interface(CLI). (Assignee: Harmen)
- Compiling the source into an windows executable. (Assignee: Bas)
- Create an configuration file to hold logins and API keys.
- Determine reliability of our data point.
- Create an module to gather data from NIST. (Assignee: Rob)
- Create an module to gather data from PubChem. (Assignee: Nout)
__Side goals:__

SIGNED.md (new file, 101 lines)

@ -0,0 +1,101 @@
##### Signed by https://keybase.io/jdekker
```
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.11 (GNU/Linux)
iQIcBAABAgAGBQJTn3GgAAoJEJrQ9RIUCT6/CI4P/RSAQrd6JugGZoQu/gNdW6eB
MYCybqYGZiieVhUaGOnFNVlp68YpXH+sP/Uc6hXEX30UQEsDmhMeT5NA7ZMS+zJ9
MNHGQdJq22lGb3+VoVBV4RTMdkQXOXvx6p5biskjIEtM3tfTxP529GvAX2TFUNnt
gGWk28EDr30M95XwDxwWo+57Xv8VtSb3VSvXEbrdwGYf8EoQo9oPtzYQ0YcdupcC
ET8bukYVcwpAjoTnPlEy89TiHHohwmimr2ASXeQ64Ks5wfjzcF7NENCAmaAfR+KI
VLLuGqdWMBx1ewVuAXTCZ0Mga/kBoRUaO0PC13UmL8LhhZY9Z3cwD4UnPU35/RQi
IbLfQcZHf/gEvyMeiTYCsyWpm+/xxn1+EfHol4/Q9VSXzZgRBX05Ik6tqeCvjdgG
4PyHBaJTTm/HfMNdg3mr1mbyjTv5UxglEyPv+Y4NdfoVfepkXsXbzvNSyVffZ3Bw
UaFp7KzIC4Jugdpv63FleiAdDY0+iZ5shH86wD1+HJ0/a87kn5Ao1yESby7J7U+f
poZQYeMFeuC0T5hY/3iYoyvZ68oH918ESESiucSulp5BvfwuqGL2+xo5uJIwGYXE
3IDQC7xbA14JHX86IVJlSHAD33iWyiC+5yjw4/bRRVl37KPsLdHiXH3YIRnF5I2I
ZbM/uDYyJdZbBe4UoCoF
=AMhi
-----END PGP SIGNATURE-----
```
<!-- END SIGNATURES -->
### Begin signed statement
#### Expect
```
size exec file contents
./
375 .gitignore d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1
464 .travis.yml 3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c
428 Changelog.md c7791d1914ddca9ff1549d90468a79787a7feafe94cecd756e3d7cbd4bcbc7df
FourmiCrawler/
0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806
2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49
914 settings.py 0be2eaf8e83e85ed27754c896421180fc80cb5ce44449aa9f1048e465d1a96f2
sources/
9991 ChemSpider.py 847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d
9898 NIST.py 97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644
4754 PubChem.py 58ed4c92519e385f2768cf8034b006b18f8a21632cb1c5a0849b1a329a8c6ffb
6907 WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97
0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
1262 source.py 16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4
3026 spider.py 1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a
1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c
3965 README.md d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3
3676 x fourmi.py 2ff89f97fd2a49d08417d9ab6cf08e88944d0c45f54ec84550b530be48676c23
261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85
tests/
1 __init__.py 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
2837 test_configurator.py 4a0eb6e7121eb09a63ab5cb797570d1a42080c5346c3b8b365da56eefa599e80
1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031
1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869
2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299
utils/
0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
3552 configurator.py e2b7e0ee6c1fef4373785dfe5df8ec6950f31ce6a5d9632b69a66ea3d1eaf921
2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37
```
#### Ignore
```
/SIGNED.md
```
#### Presets
```
git # ignore .git and anything as described by .gitignore files
dropbox # ignore .dropbox-cache and other Dropbox-related files
kb # ignore anything as described by .kbignore files
```
<!-- summarize version = 0.0.9 -->
### End signed statement
<hr>
#### Notes
With keybase you can sign any directory's contents, whether it's a git repo,
source code distribution, or a personal documents folder. It aims to replace the drudgery of:
1. comparing a zipped file to a detached statement
2. downloading a public key
3. confirming it is in fact the author's by reviewing public statements they've made, using it
All in one simple command:
```bash
keybase dir verify
```
There are lots of options, including assertions for automating your checks.
For more info, check out https://keybase.io/docs/command_line/code_signing

fourmi.py

@ -5,7 +5,7 @@ Fourmi, a web scraper build to search specific information for a given compound
Usage:
fourmi search <compound>
fourmi [options] search <compound>
fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
fourmi list
fourmi [--include=<sourcename> | --exclude=<sourcename>] list
fourmi -h | --help
@ -15,7 +15,7 @@ Options:
--attributes=<regex> Include only that match these regular expressions split by a comma. [default: .*]
-h --help Show this screen.
--version Show version.
--verbose Verbose logging output.
-v Verbose logging output. (Multiple occurrences increase logging level)
--log=<file> Save log to an file.
-o <file> --output=<file> Output file [default: results.*format*]
-f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
@ -25,8 +25,7 @@ Options:
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings
from scrapy import signals, log
import docopt
from FourmiCrawler.spider import FourmiSpider
@ -58,15 +57,19 @@ def search(docopt_arguments, source_loader):
:param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
"""
conf = Configurator()
conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
source_loader, docopt_arguments["--attributes"].split(','))
if conf.scrapy_settings.getbool("LOG_ENABLED"):
log.start(conf.scrapy_settings.get("LOG_FILE"),
conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
reactor.run()
# The start for the Fourmi Command Line interface.
if __name__ == '__main__':
arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0')
arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.3')
loader = SourceLoader()
if arguments["--include"]:

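The usage pattern `[-v | -vv | -vvv]` makes docopt count the repeated flag, so `docopt_arguments["-v"]` is an integer between 0 and 3 that can be handed straight to `set_logging`. A minimal sketch of that counting behaviour, condensed from the fourmi.py docstring above:

```python
"""Verbosity sketch.

Usage:
  sketch.py [-v | -vv | -vvv]

Options:
  -v  Verbose logging output. (Multiple occurrences increase logging level)
"""
import docopt

if __name__ == '__main__':
    arguments = docopt.docopt(__doc__)
    # No flag -> 0, -v -> 1, -vv -> 2, -vvv -> 3
    print(arguments['-v'])
```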
tests/test_configurator.py

@ -1,7 +1,8 @@
import unittest
import ConfigParser
from utils.configurator import Configurator
import ConfigParser
class TestConfigurator(unittest.TestCase):
@ -21,11 +22,28 @@ class TestConfigurator(unittest.TestCase):
self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
# def test_start_log(self):
# self.conf.start_log("test.log", True)
# self.conf.start_log("test.log", False)
# self.conf.start_log(None, True)
# self.conf.start_log(None, False)
def test_start_log(self):
for i in range(0, 3):
self.conf.set_logging("TEST", i)
self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST")
if i > 0:
self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True)
if i > 1:
self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False)
else:
self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
else:
self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False)
self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
if i == 1:
self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING")
elif i == 2:
self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO")
elif i == 3:
self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG")
self.conf.set_logging(verbose=i)
self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None)
def test_read_sourceconfiguration(self):
config = self.conf.read_sourceconfiguration()

tests/test_pipeline.py

@ -13,6 +13,7 @@ class TestPipelines(unittest.TestCase):
def test_none_pipeline(self):
# Testing the pipeline that replaces the None values in items.
self.testItem["value"] = "abc"
self.testItem["source"] = None
pipe = pipelines.RemoveNonePipeline()
processed = pipe.process_item(self.testItem, spider.FourmiSpider())

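The added line gives the test item a `None` value for `source` before it is run through `RemoveNonePipeline`. That pipeline is defined in `FourmiCrawler/pipelines.py`, which this commit does not modify; a hypothetical sketch of what a pipeline like this plausibly does, assuming Scrapy's `process_item(item, spider)` contract:

```python
# Hypothetical sketch only -- the real RemoveNonePipeline lives in
# FourmiCrawler/pipelines.py and is not shown in this diff.
class RemoveNonePipeline(object):
    def process_item(self, item, spider):
        # Replace None values with empty strings so exporters and later
        # pipelines never see missing fields.
        for key in item:
            if item[key] is None:
                item[key] = ''
        return item
```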
tests/test_spider.py

@ -47,7 +47,6 @@ class TestFoumiSpider(unittest.TestCase):
self.assertGreater(len(requests), 0)
self.assertIsInstance(requests[0], Request)
def test_synonym_requests(self):
# A test for the synonym request function
self.spi._sources = []

utils/configurator.py

@ -1,7 +1,8 @@
from scrapy import log
from scrapy.utils.project import get_project_settings
import ConfigParser
from scrapy.utils.project import get_project_settings
class Configurator:
"""
A helper class in the fourmi class. This class is used to process the settings as set
@ -11,7 +12,6 @@ class Configurator:
def __init__(self):
self.scrapy_settings = get_project_settings()
def set_output(self, filename, fileformat):
"""
This function manipulates the Scrapy output file settings that normally would be set in the settings file.
@ -30,23 +30,34 @@ class Configurator:
if fileformat is not None:
self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
def start_log(self, logfile, verbose):
def set_logging(self, logfile=None, verbose=0):
"""
This function starts the logging functionality of Scrapy using the settings given by the CLI.
This function changes the default settings of Scrapy's logging functionality
using the settings given by the CLI.
:param logfile: The location where the logfile will be saved.
:param verbose: A boolean value to switch between loglevels.
:param verbose: An integer value to switch between loglevels.
"""
if verbose != 0:
self.scrapy_settings.overrides["LOG_ENABLED"] = True
else:
self.scrapy_settings.overrides["LOG_ENABLED"] = False
if verbose == 1:
self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
elif verbose == 2:
self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
else:
self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"
if verbose > 1:
self.scrapy_settings.overrides["LOG_STDOUT"] = False
else:
self.scrapy_settings.overrides["LOG_STDOUT"] = True
if logfile is not None:
if verbose:
log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
self.scrapy_settings.overrides["LOG_FILE"] = logfile
else:
log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
else:
if verbose:
log.start(logstdout=False, loglevel=log.DEBUG)
else:
log.start(logstdout=True, loglevel=log.WARNING)
self.scrapy_settings.overrides["LOG_FILE"] = None
@staticmethod
def read_sourceconfiguration():
@ -75,7 +86,6 @@ class Configurator:
elif config.defaults():
section = config.defaults()
if 'reliability' not in section:
log.msg('Reliability not set for %s' % sourcename,
level=log.WARNING)
print 'WARNING: Reliability not set for %s' % sourcename
section['reliability'] = ''
return section

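`set_logging` replaces the old `start_log` and only manipulates Scrapy settings; the actual `log.start` call now happens in fourmi.py once `LOG_ENABLED` is known. The verbosity mapping it implements, restated as a small standalone function (a sketch; the real method writes into `scrapy_settings.overrides` as shown above):

```python
def logging_settings(verbose=0, logfile=None):
    """Restate the Configurator.set_logging mapping as plain data (sketch)."""
    return {
        'LOG_ENABLED': verbose != 0,
        # -v -> WARNING, -vv -> INFO, -vvv (and above, or 0) -> DEBUG
        'LOG_LEVEL': 'WARNING' if verbose == 1 else 'INFO' if verbose == 2 else 'DEBUG',
        # LOG_STDOUT is switched off once verbosity goes past -v
        'LOG_STDOUT': not verbose > 1,
        'LOG_FILE': logfile,
    }


assert logging_settings(0)['LOG_ENABLED'] is False
assert logging_settings(1)['LOG_LEVEL'] == 'WARNING'
assert logging_settings(3)['LOG_STDOUT'] is False
```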
utils/sourceloader.py

@ -5,6 +5,7 @@ import re
from FourmiCrawler.sources.source import Source
from utils.configurator import Configurator
class SourceLoader:
sources = []