
Merge branch 'develop' into feature/GUI

Harmen Prins 2014-06-17 17:14:36 +02:00
commit 738e1afb36
16 changed files with 331 additions and 76 deletions

Changelog.md (new file)

@@ -0,0 +1,12 @@
### v0.5.3
- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
- FIX: Logging is now "actually" disabled if not using the verbose option.
- FEATURE: Added support for PubChem
### v0.5.2
- FIX: The signature used to contain untracked and older files; the current signature
should be correct.
### v0.5.1
- UPDATED: Logging functionality from command line
- DEV: Code cleanup and extra tests

FourmiCrawler/settings.py

@@ -18,8 +18,10 @@ ITEM_PIPELINES = {
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent
+# [todo] - Check for repercussions on spoofing the user agent
 # USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'

FourmiCrawler/sources/ChemSpider.py

@@ -26,9 +26,8 @@ class ChemSpider(Source):
     structure = 'Chemical-Structure.%s.html'
     extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='

-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
-        self.cfg = config
         self.ignore_list = []
         if 'token' not in self.cfg or self.cfg['token'] == '':
             log.msg('ChemSpider token not set or empty, search/MassSpec API '
@@ -37,7 +36,6 @@ class ChemSpider(Source):
         self.search += self.cfg['token']
         self.extendedinfo += self.cfg['token']

     def parse(self, response):
         sel = Selector(response)
         requests = []
@@ -199,13 +197,14 @@ class ChemSpider(Source):
         return properties

     def newresult(self, attribute, value, conditions='', source='ChemSpider'):
-        return Result({
+        return Result(
+            {
                 'attribute': attribute,
                 'value': value,
                 'source': source,
                 'reliability': self.cfg['reliability'],
                 'conditions': conditions
             })

     def parse_searchrequest(self, response):
         """Parse the initial response of the ChemSpider Search API """

FourmiCrawler/sources/NIST.py

@@ -22,12 +22,9 @@ class NIST(Source):
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

-    cfg = {}
-
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
         self.ignore_list = set()
-        self.cfg = config

     def parse(self, response):
         sel = Selector(response)
@@ -88,7 +85,6 @@ class NIST(Source):
         InChiKey, CAS number
         """
         ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
-        li = ul.xpath('li')
         raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
         for synonym in raw_synonyms[0].strip().split(';\n'):
@@ -255,12 +251,13 @@ class NIST(Source):
         return results

     def newresult(self, attribute, value, conditions=''):
-        return Result({
+        return Result(
+            {
                 'attribute': attribute,
                 'value': value,
                 'source': 'NIST',
                 'reliability': self.cfg['reliability'],
                 'conditions': conditions
             })

     def new_compound_request(self, compound):

FourmiCrawler/sources/PubChem.py (new file)

@@ -0,0 +1,111 @@
from scrapy.http import Request
from scrapy import log
from source import Source
from scrapy.selector import Selector
from FourmiCrawler.items import Result
import re


class PubChem(Source):
    """ PubChem scraper for chemical properties

    This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance,
    including sources of the values of properties.
    """
    # PubChem has its data on compound name, properties and their values on different html pages,
    # so different URLs are used
    website = 'https://*.ncbi.nlm.nih.gov/*'
    website_www = 'https://www.ncbi.nlm.nih.gov/*'
    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
    search = 'pccompound?term=%s'
    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'

    __spider = None
    searched_compounds = set()

    def __init__(self, config):
        Source.__init__(self, config)
        self.cfg = config

    def parse(self, response):
        """
        Distributes the above described behaviour
        :param response: The incoming search request
        :return: Returns the found properties if the response is unique, or None if it's already known
        """
        requests = []
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)

        sel = Selector(response)
        compound = sel.xpath('//h1/text()').extract()[0]
        if compound in self.searched_compounds:
            return None

        self.searched_compounds.update(compound)
        raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
        for synonym in raw_synonyms.strip().split(', '):
            log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
            self.searched_compounds.update(synonym)
            self._spider.get_synonym_requests(synonym)
        log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)

        n = re.search(r'cid=(\d+)', response.url)
        if n:
            cid = n.group(1)
            log.msg('cid: %s' % cid, level=log.DEBUG)
            # getting the right id of the compound with which it can reach the separate
            # html page which contains the properties and their values
            # using this cid to get the right url and scrape it
            requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
        return requests

    def parse_data(self, response):
        """
        Parse data found in 'Chemical and Physical properties' part of a substance page.
        :param response: The response with the page to parse
        :return: requests: Returns a list of properties with their values, source, etc.
        """
        log.msg('parsing data', level=log.DEBUG)
        requests = []

        sel = Selector(response)
        props = sel.xpath('//div')

        for prop in props:
            prop_name = ''.join(prop.xpath('b/text()').extract())  # name of property that it is parsing
            if prop.xpath('a'):  # parsing for single value in property
                prop_source = ''.join(prop.xpath('a/@title').extract())
                prop_value = ''.join(prop.xpath('a/text()').extract())
                new_prop = Result({
                    'attribute': prop_name,
                    'value': prop_value,
                    'source': prop_source,
                    'reliability': 'Unknown',
                    'conditions': ''
                })
                log.msg('PubChem prop: |%s| |%s| |%s|' %
                        (new_prop['attribute'], new_prop['value'],
                         new_prop['source']), level=log.DEBUG)
                requests.append(new_prop)
            elif prop.xpath('ul'):  # parsing for multiple values (list) in property
                prop_values = prop.xpath('ul//li')
                for prop_li in prop_values:
                    prop_value = ''.join(prop_li.xpath('a/text()').extract())
                    prop_source = ''.join(prop_li.xpath('a/@title').extract())
                    new_prop = Result({
                        'attribute': prop_name,
                        'value': prop_value,
                        'source': prop_source,
                        'reliability': 'Unknown',
                        'conditions': ''
                    })
                    log.msg('PubChem prop: |%s| |%s| |%s|' %
                            (new_prop['attribute'], new_prop['value'],
                             new_prop['source']), level=log.DEBUG)
                    requests.append(new_prop)
        return requests

    def new_compound_request(self, compound):
        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)

FourmiCrawler/sources/WikipediaParser.py

@@ -1,9 +1,11 @@
+import re
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector
+from source import Source
 from FourmiCrawler.items import Result
-import re


 class WikipediaParser(Source):
@@ -17,11 +19,8 @@ class WikipediaParser(Source):
     __spider = None
     searched_compounds = []

-    cfg = {}
-
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
-        self.cfg = config

     def parse(self, response):
         """
@@ -53,7 +52,7 @@ class WikipediaParser(Source):
         # scrape the chembox (wikipedia template)
         items = self.parse_chembox(sel, items)

-        #scrape the drugbox (wikipedia template)
+        # scrape the drugbox (wikipedia template)
         items = self.parse_drugbox(sel, items)

         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
@@ -123,7 +122,6 @@ class WikipediaParser(Source):
                 level=log.DEBUG)
         return items

     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)
@@ -161,10 +159,11 @@ class WikipediaParser(Source):
         return links

     def newresult(self, attribute, value):
-        return Result({
+        return Result(
+            {
                 'attribute': attribute,
                 'value': value,
                 'source': 'Wikipedia',
                 'reliability': self.cfg['reliability'],
                 'conditions': ''
             })

FourmiCrawler/sources/source.py

@@ -6,10 +6,13 @@ class Source:
     website = "http://something/*"  # Regex of URI's the source is able to parse
     _spider = None

-    def __init__(self, config={}):
+    def __init__(self, config=None):
         """
         Initiation of a new Source
         """
+        self.cfg = {}
+        if config is not None:
+            self.cfg = config
         pass

     def parse(self, response):
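The `config=None` default adopted here (and in the ChemSpider, NIST, WikipediaParser and spider constructors above) avoids Python's shared mutable default argument. A minimal sketch, not part of the commit, of why `config={}` is risky; the class names are made up for illustration:

```python
# Minimal sketch: a mutable default is evaluated once and shared by every call,
# so state written through one instance leaks into every other instance.
class Risky(object):
    def __init__(self, config={}):        # one dict shared by all instances
        self.cfg = config


class Safe(object):
    def __init__(self, config=None):      # fresh dict unless one is supplied
        self.cfg = {} if config is None else config


a, b = Risky(), Risky()
a.cfg['token'] = 'abc'
print(b.cfg)   # {'token': 'abc'}  -- leaked into b

c, d = Safe(), Safe()
c.cfg['token'] = 'abc'
print(d.cfg)   # {}
```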

FourmiCrawler/spider.py

@@ -10,7 +10,7 @@ class FourmiSpider(Spider):
     """
     name = "FourmiSpider"

-    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
+    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
         """
         Initiation of the Spider
         :param compound: compound that will be searched.
@@ -20,7 +20,10 @@ class FourmiSpider(Spider):
         self.synonyms = set()
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.add(compound)
-        self.selected_attributes = selected_attributes
+        if selected_attributes is None:
+            self.selected_attributes = [".*"]
+        else:
+            self.selected_attributes = selected_attributes

     def parse(self, response):
         """

README.md

@@ -23,21 +23,21 @@ documentation](http://doc.scrapy.org/en/latest/index.html).
 ### Installing

-If you're installing Fourmi, please take a look at our [installation guide](...)
-on our wiki. When you've installed the application, make sure to check our
-[usage guide](...).
+If you're installing Fourmi, please take a look at our installation guides
+on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our
+usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI).

 ### Using the Source

 To use the Fourmi source code multiple dependencies are required. Take a look at
-the [wiki page](...) on using the application source code for a step by step
+our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code in our a step by step
 installation guide.

 When developing for the Fourmi project keep in mind that code readability is a
 must. To maintain the readability, code should be conform with the
 [PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
 code. More information about the different structures and principles of the
-Fourmi application can be found on our [wiki](...).
+Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki).

 ### To Do
@@ -45,13 +45,9 @@ The Fourmi project has the following goals for the nearby future:
 __Main goals:__

-- Improve our documentation and guides. (Assignee: Dekker)
 - Build an graphical user interface(GUI) as alternative for the command line
 interface(CLI). (Assignee: Harmen)
 - Compiling the source into an windows executable. (Assignee: Bas)
-- Create an configuration file to hold logins and API keys.
-- Determine reliability of our data point.
-- Create an module to gather data from NIST. (Assignee: Rob)
 - Create an module to gather data from PubChem. (Assignee: Nout)

 __Side goals:__

SIGNED.md (new file)

@@ -0,0 +1,101 @@
##### Signed by https://keybase.io/jdekker
```
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.11 (GNU/Linux)
iQIcBAABAgAGBQJTn3GgAAoJEJrQ9RIUCT6/CI4P/RSAQrd6JugGZoQu/gNdW6eB
MYCybqYGZiieVhUaGOnFNVlp68YpXH+sP/Uc6hXEX30UQEsDmhMeT5NA7ZMS+zJ9
MNHGQdJq22lGb3+VoVBV4RTMdkQXOXvx6p5biskjIEtM3tfTxP529GvAX2TFUNnt
gGWk28EDr30M95XwDxwWo+57Xv8VtSb3VSvXEbrdwGYf8EoQo9oPtzYQ0YcdupcC
ET8bukYVcwpAjoTnPlEy89TiHHohwmimr2ASXeQ64Ks5wfjzcF7NENCAmaAfR+KI
VLLuGqdWMBx1ewVuAXTCZ0Mga/kBoRUaO0PC13UmL8LhhZY9Z3cwD4UnPU35/RQi
IbLfQcZHf/gEvyMeiTYCsyWpm+/xxn1+EfHol4/Q9VSXzZgRBX05Ik6tqeCvjdgG
4PyHBaJTTm/HfMNdg3mr1mbyjTv5UxglEyPv+Y4NdfoVfepkXsXbzvNSyVffZ3Bw
UaFp7KzIC4Jugdpv63FleiAdDY0+iZ5shH86wD1+HJ0/a87kn5Ao1yESby7J7U+f
poZQYeMFeuC0T5hY/3iYoyvZ68oH918ESESiucSulp5BvfwuqGL2+xo5uJIwGYXE
3IDQC7xbA14JHX86IVJlSHAD33iWyiC+5yjw4/bRRVl37KPsLdHiXH3YIRnF5I2I
ZbM/uDYyJdZbBe4UoCoF
=AMhi
-----END PGP SIGNATURE-----
```
<!-- END SIGNATURES -->
### Begin signed statement
#### Expect
```
size exec file contents
./
375 .gitignore d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1
464 .travis.yml 3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c
428 Changelog.md c7791d1914ddca9ff1549d90468a79787a7feafe94cecd756e3d7cbd4bcbc7df
FourmiCrawler/
0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806
2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49
914 settings.py 0be2eaf8e83e85ed27754c896421180fc80cb5ce44449aa9f1048e465d1a96f2
sources/
9991 ChemSpider.py 847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d
9898 NIST.py 97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644
4754 PubChem.py 58ed4c92519e385f2768cf8034b006b18f8a21632cb1c5a0849b1a329a8c6ffb
6907 WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97
0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
1262 source.py 16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4
3026 spider.py 1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a
1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c
3965 README.md d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3
3676 x fourmi.py 2ff89f97fd2a49d08417d9ab6cf08e88944d0c45f54ec84550b530be48676c23
261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85
tests/
1 __init__.py 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
2837 test_configurator.py 4a0eb6e7121eb09a63ab5cb797570d1a42080c5346c3b8b365da56eefa599e80
1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031
1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869
2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299
utils/
0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
3552 configurator.py e2b7e0ee6c1fef4373785dfe5df8ec6950f31ce6a5d9632b69a66ea3d1eaf921
2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37
```
#### Ignore
```
/SIGNED.md
```
#### Presets
```
git # ignore .git and anything as described by .gitignore files
dropbox # ignore .dropbox-cache and other Dropbox-related files
kb # ignore anything as described by .kbignore files
```
<!-- summarize version = 0.0.9 -->
### End signed statement
<hr>
#### Notes
With keybase you can sign any directory's contents, whether it's a git repo,
source code distribution, or a personal documents folder. It aims to replace the drudgery of:
1. comparing a zipped file to a detached statement
2. downloading a public key
3. confirming it is in fact the author's by reviewing public statements they've made, using it
All in one simple command:
```bash
keybase dir verify
```
There are lots of options, including assertions for automating your checks.
For more info, check out https://keybase.io/docs/command_line/code_signing

fourmi.py

@@ -5,7 +5,7 @@ Fourmi, a web scraper build to search specific information for a given compound
 Usage:
     fourmi search <compound>
     fourmi [options] search <compound>
-    fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
+    fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
     fourmi list
     fourmi [--include=<sourcename> | --exclude=<sourcename>] list
     fourmi -h | --help
@@ -15,7 +15,7 @@ Options:
     --attributes=<regex>     Include only that match these regular expressions split by a comma. [default: .*]
     -h --help                Show this screen.
     --version                Show version.
-    --verbose                Verbose logging output.
+    -v                       Verbose logging output. (Multiple occurrences increase logging level)
     --log=<file>             Save log to an file.
     -o <file> --output=<file> Output file [default: results.*format*]
     -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
@@ -25,8 +25,7 @@ Options:
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
-from scrapy import log, signals
-from scrapy.utils.project import get_project_settings
+from scrapy import signals, log
 import docopt

 from FourmiCrawler.spider import FourmiSpider
@@ -58,15 +57,19 @@ def search(docopt_arguments, source_loader):
     :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
     """
     conf = Configurator()
-    conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
+    conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
     conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
-    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
+    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
+                  source_loader, docopt_arguments["--attributes"].split(','))
+    if conf.scrapy_settings.getbool("LOG_ENABLED"):
+        log.start(conf.scrapy_settings.get("LOG_FILE"),
+                  conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
     reactor.run()


 # The start for the Fourmi Command Line interface.
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.3')
     loader = SourceLoader()

     if arguments["--include"]:
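Because the usage pattern now lists `[-v | -vv | -vvv]`, docopt reports the flag as an occurrence count, which is the integer handed to `Configurator.set_logging` in `search()` above. A minimal sketch of that behaviour, not taken from the repository; the program name and argument values are made up:

```python
# Minimal sketch: docopt counts repeated short flags when the pattern allows repetition.
import docopt

usage = """
Usage:
    prog [-v | -vv | -vvv] search <compound>

Options:
    -v    Verbose logging output. (Multiple occurrences increase logging level)
"""

args = docopt.docopt(usage, argv=['-vv', 'search', 'caffeine'])
print(args['-v'])          # 2 -> set_logging() would select the "INFO" log level
print(args['<compound>'])  # 'caffeine'
```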

tests/test_configurator.py

@@ -1,7 +1,8 @@
 import unittest
+import ConfigParser

 from utils.configurator import Configurator
-import ConfigParser


 class TestConfigurator(unittest.TestCase):
@@ -21,11 +22,28 @@ class TestConfigurator(unittest.TestCase):
         self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")

-    # def test_start_log(self):
-    #     self.conf.start_log("test.log", True)
-    #     self.conf.start_log("test.log", False)
-    #     self.conf.start_log(None, True)
-    #     self.conf.start_log(None, False)
+    def test_start_log(self):
+        for i in range(0, 3):
+            self.conf.set_logging("TEST", i)
+            self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST")
+            if i > 0:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True)
+                if i > 1:
+                    self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False)
+                else:
+                    self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
+            else:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False)
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
+            if i == 1:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING")
+            elif i == 2:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO")
+            elif i == 3:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG")
+
+            self.conf.set_logging(verbose=i)
+            self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None)

     def test_read_sourceconfiguration(self):
         config = self.conf.read_sourceconfiguration()

tests/test_pipeline.py

@@ -13,6 +13,7 @@ class TestPipelines(unittest.TestCase):
     def test_none_pipeline(self):
         # Testing the pipeline that replaces the None values in items.
         self.testItem["value"] = "abc"
+        self.testItem["source"] = None
         pipe = pipelines.RemoveNonePipeline()
         processed = pipe.process_item(self.testItem, spider.FourmiSpider())

tests/test_spider.py

@@ -47,7 +47,6 @@ class TestFoumiSpider(unittest.TestCase):
         self.assertGreater(len(requests), 0)
         self.assertIsInstance(requests[0], Request)

     def test_synonym_requests(self):
         # A test for the synonym request function
         self.spi._sources = []

utils/configurator.py

@@ -1,7 +1,8 @@
-from scrapy import log
-from scrapy.utils.project import get_project_settings
 import ConfigParser
+
+from scrapy.utils.project import get_project_settings


 class Configurator:
     """
     A helper class in the fourmi class. This class is used to process the settings as set
@@ -11,7 +12,6 @@ class Configurator:
     def __init__(self):
         self.scrapy_settings = get_project_settings()

     def set_output(self, filename, fileformat):
         """
         This function manipulates the Scrapy output file settings that normally would be set in the settings file.
@@ -30,23 +30,34 @@ class Configurator:
         if fileformat is not None:
             self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat

-    def start_log(self, logfile, verbose):
+    def set_logging(self, logfile=None, verbose=0):
         """
-        This function starts the logging functionality of Scrapy using the settings given by the CLI.
+        This function changes the default settings of Scapy's logging functionality
+        using the settings given by the CLI.
         :param logfile: The location where the logfile will be saved.
-        :param verbose: A boolean value to switch between loglevels.
+        :param verbose: A integer value to switch between loglevels.
         """
-        if logfile is not None:
-            if verbose:
-                log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
-            else:
-                log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
-        else:
-            if verbose:
-                log.start(logstdout=False, loglevel=log.DEBUG)
-            else:
-                log.start(logstdout=True, loglevel=log.WARNING)
+        if verbose != 0:
+            self.scrapy_settings.overrides["LOG_ENABLED"] = True
+        else:
+            self.scrapy_settings.overrides["LOG_ENABLED"] = False
+
+        if verbose == 1:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
+        elif verbose == 2:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
+        else:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"
+
+        if verbose > 1:
+            self.scrapy_settings.overrides["LOG_STDOUT"] = False
+        else:
+            self.scrapy_settings.overrides["LOG_STDOUT"] = True
+
+        if logfile is not None:
+            self.scrapy_settings.overrides["LOG_FILE"] = logfile
+        else:
+            self.scrapy_settings.overrides["LOG_FILE"] = None

     @staticmethod
     def read_sourceconfiguration():
@@ -56,7 +67,7 @@ class Configurator:
         :return a ConfigParser object of sources.cfg
         """
         config = ConfigParser.ConfigParser()
         config.read('sources.cfg')  # [TODO]: should be softcoded eventually
         return config

     @staticmethod
@@ -75,7 +86,6 @@ class Configurator:
         elif config.defaults():
             section = config.defaults()
         if 'reliability' not in section:
-            log.msg('Reliability not set for %s' % sourcename,
-                    level=log.WARNING)
+            print 'WARNING: Reliability not set for %s' % sourcename
             section['reliability'] = ''
         return section
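A short usage sketch, not part of the commit, of how the `-v` count maps onto Scrapy settings through the new `set_logging` branches shown above; the log file name is made up, and it assumes the repository layout listed in SIGNED.md:

```python
# Minimal sketch: the -v occurrence count selects LOG_ENABLED, LOG_LEVEL and LOG_STDOUT.
from utils.configurator import Configurator

conf = Configurator()

conf.set_logging("fourmi.log", 0)  # LOG_ENABLED False, LOG_LEVEL "DEBUG",   LOG_STDOUT True
conf.set_logging("fourmi.log", 1)  # LOG_ENABLED True,  LOG_LEVEL "WARNING", LOG_STDOUT True
conf.set_logging("fourmi.log", 2)  # LOG_ENABLED True,  LOG_LEVEL "INFO",    LOG_STDOUT False
conf.set_logging(verbose=3)        # LOG_ENABLED True,  LOG_LEVEL "DEBUG",   LOG_STDOUT False, LOG_FILE None

print(conf.scrapy_settings.get("LOG_LEVEL"))  # "DEBUG"
```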

utils/sourceloader.py

@@ -5,6 +5,7 @@ import re
 from FourmiCrawler.sources.source import Source
 from utils.configurator import Configurator


 class SourceLoader:
     sources = []