Merge branch 'develop' into feature/GUI

Commit 738e1afb36

Changelog.md (new file, +12)
@@ -0,0 +1,12 @@
### v0.5.3
- FIX: It is now again possible to use both verbose and the source inclusion/exclusion options.
- FIX: Logging is now "actually" disabled if not using the verbose option.
- FEATURE: Added support for PubChem.

### v0.5.2
- FIX: Signatures used to contain untracked and older files; the current signature should be correct.

### v0.5.1
- UPDATED: Logging functionality from command line.
- DEV: Code cleanup and extra tests.
@@ -18,8 +18,10 @@ ITEM_PIPELINES = {
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'


 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent
+# [todo] - Check for repercussions on spoofing the user agent
 # USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
@@ -26,9 +26,8 @@ class ChemSpider(Source):
     structure = 'Chemical-Structure.%s.html'
     extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='

-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
-        self.cfg = config
         self.ignore_list = []
         if 'token' not in self.cfg or self.cfg['token'] == '':
             log.msg('ChemSpider token not set or empty, search/MassSpec API '
@@ -37,7 +36,6 @@ class ChemSpider(Source):
         self.search += self.cfg['token']
         self.extendedinfo += self.cfg['token']
-

     def parse(self, response):
         sel = Selector(response)
         requests = []
@@ -199,7 +197,8 @@ class ChemSpider(Source):
         return properties

     def newresult(self, attribute, value, conditions='', source='ChemSpider'):
-        return Result({
+        return Result(
+            {
                 'attribute': attribute,
                 'value': value,
                 'source': source,
@@ -22,12 +22,9 @@ class NIST(Source):

     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

-    cfg = {}
-
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
         self.ignore_list = set()
-        self.cfg = config

     def parse(self, response):
         sel = Selector(response)
@@ -88,7 +85,6 @@ class NIST(Source):
         InChiKey, CAS number
         """
         ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
-        li = ul.xpath('li')

         raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
         for synonym in raw_synonyms[0].strip().split(';\n'):
@@ -255,7 +251,8 @@ class NIST(Source):
         return results

     def newresult(self, attribute, value, conditions=''):
-        return Result({
+        return Result(
+            {
                 'attribute': attribute,
                 'value': value,
                 'source': 'NIST',
FourmiCrawler/sources/PubChem.py (new file, +111)

@@ -0,0 +1,111 @@
from scrapy.http import Request
from scrapy import log
from source import Source
from scrapy.selector import Selector
from FourmiCrawler.items import Result
import re


class PubChem(Source):
    """ PubChem scraper for chemical properties

    This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance,
    including sources of the values of properties.
    """

    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
    website = 'https://*.ncbi.nlm.nih.gov/*'
    website_www = 'https://www.ncbi.nlm.nih.gov/*'
    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
    search = 'pccompound?term=%s'
    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'

    __spider = None
    searched_compounds = set()

    def __init__(self, config):
        Source.__init__(self, config)
        self.cfg = config

    def parse(self, response):
        """
        Distributes the above described behaviour
        :param response: The incoming search request
        :return: Returns the found properties if response is unique or returns none if it's already known
        """
        requests = []
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)

        sel = Selector(response)
        compound = sel.xpath('//h1/text()').extract()[0]
        if compound in self.searched_compounds:
            return None

        self.searched_compounds.update(compound)
        raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
        for synonym in raw_synonyms.strip().split(', '):
            log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
            self.searched_compounds.update(synonym)
            self._spider.get_synonym_requests(synonym)
        log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)

        n = re.search(r'cid=(\d+)', response.url)
        if n:
            cid = n.group(1)
            log.msg('cid: %s' % cid, level=log.DEBUG)  # getting the right id of the compound with which it can reach
            # the separate html page which contains the properties and their values

            # using this cid to get the right url and scrape it
            requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
        return requests

    def parse_data(self, response):
        """
        Parse data found in 'Chemical and Physical properties' part of a substance page.
        :param response: The response with the page to parse
        :return: requests: Returns a list of properties with their values, source, etc.
        """
        log.msg('parsing data', level=log.DEBUG)
        requests = []

        sel = Selector(response)
        props = sel.xpath('//div')

        for prop in props:
            prop_name = ''.join(prop.xpath('b/text()').extract())  # name of property that it is parsing
            if prop.xpath('a'):  # parsing for single value in property
                prop_source = ''.join(prop.xpath('a/@title').extract())
                prop_value = ''.join(prop.xpath('a/text()').extract())
                new_prop = Result({
                    'attribute': prop_name,
                    'value': prop_value,
                    'source': prop_source,
                    'reliability': 'Unknown',
                    'conditions': ''
                })
                log.msg('PubChem prop: |%s| |%s| |%s|' %
                        (new_prop['attribute'], new_prop['value'],
                         new_prop['source']), level=log.DEBUG)
                requests.append(new_prop)
            elif prop.xpath('ul'):  # parsing for multiple values (list) in property
                prop_values = prop.xpath('ul//li')
                for prop_li in prop_values:
                    prop_value = ''.join(prop_li.xpath('a/text()').extract())
                    prop_source = ''.join(prop_li.xpath('a/@title').extract())
                    new_prop = Result({
                        'attribute': prop_name,
                        'value': prop_value,
                        'source': prop_source,
                        'reliability': 'Unknown',
                        'conditions': ''
                    })
                    log.msg('PubChem prop: |%s| |%s| |%s|' %
                            (new_prop['attribute'], new_prop['value'],
                             new_prop['source']), level=log.DEBUG)
                    requests.append(new_prop)

        return requests

    def new_compound_request(self, compound):
        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
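The URL convention above deserves a note: the class-level patterns (`website_www`, `website_pubchem`) end in a `*` wildcard used for source matching, so the request builders strip that final character with `[:-1]` before appending a relative template. A minimal standalone sketch of that arithmetic, using the values from the class above (the example result URL is made up for illustration):

```python
import re

# Patterns and templates as declared on the PubChem class above.
website_www = 'https://www.ncbi.nlm.nih.gov/*'
website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
search = 'pccompound?term=%s'
data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'

# new_compound_request: drop the trailing '*' and append the search template.
print(website_www[:-1] + search % 'methane')
# https://www.ncbi.nlm.nih.gov/pccompound?term=methane

# parse: recover the compound id (cid) from a result URL, then build the
# properties-page URL. The input URL here is a hypothetical example.
match = re.search(r'cid=(\d+)', 'https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297')
if match:
    print(website_pubchem[:-1] + data_url % match.group(1))
    # https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297
```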
@@ -1,9 +1,11 @@
+import re
+
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re


 class WikipediaParser(Source):
@@ -17,11 +19,8 @@ class WikipediaParser(Source):
     __spider = None
     searched_compounds = []

-    cfg = {}
-
-    def __init__(self, config={}):
+    def __init__(self, config=None):
         Source.__init__(self, config)
-        self.cfg = config

     def parse(self, response):
         """
@@ -53,7 +52,7 @@ class WikipediaParser(Source):
         # scrape the chembox (wikipedia template)
         items = self.parse_chembox(sel, items)

-        #scrape the drugbox (wikipedia template)
+        # scrape the drugbox (wikipedia template)
         items = self.parse_drugbox(sel, items)

         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
@@ -123,7 +122,6 @@ class WikipediaParser(Source):
                 level=log.DEBUG)
         return items

-
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)

@@ -161,7 +159,8 @@ class WikipediaParser(Source):
         return links

     def newresult(self, attribute, value):
-        return Result({
+        return Result(
+            {
                 'attribute': attribute,
                 'value': value,
                 'source': 'Wikipedia',
@@ -6,10 +6,13 @@ class Source:
     website = "http://something/*"  # Regex of URI's the source is able to parse
     _spider = None

-    def __init__(self, config={}):
+    def __init__(self, config=None):
         """
         Initiation of a new Source
         """
+        self.cfg = {}
+        if config is not None:
+            self.cfg = config
         pass

     def parse(self, response):
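The `config={}` to `config=None` change that recurs in ChemSpider, NIST, WikipediaParser and here in Source fixes a classic Python pitfall: a mutable default value is created once, when the function is defined, and is then shared by every call that omits the argument. A minimal sketch of the failure mode and of the idiom the diff adopts (illustrative function names only):

```python
def broken(config={}):
    # One shared dict for all calls: a mutation made during one call
    # is still visible in the next call.
    config['calls'] = config.get('calls', 0) + 1
    return config

print(broken())  # {'calls': 1}
print(broken())  # {'calls': 2}  <- state leaked between calls

def fixed(config=None):
    # Fresh dict per call unless the caller supplies one,
    # mirroring Source.__init__ in the hunk above.
    cfg = {}
    if config is not None:
        cfg = config
    return cfg

print(fixed())  # {}
print(fixed())  # {} -- no leakage
```

The same idiom explains the spider's `selected_attributes=None` change in the next hunks.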
@@ -10,7 +10,7 @@ class FourmiSpider(Spider):
     """
     name = "FourmiSpider"

-    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
+    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
         """
         Initiation of the Spider
         :param compound: compound that will be searched.
@@ -20,6 +20,9 @@ class FourmiSpider(Spider):
         self.synonyms = set()
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.add(compound)
+        if selected_attributes is None:
+            self.selected_attributes = [".*"]
+        else:
             self.selected_attributes = selected_attributes

     def parse(self, response):
README.md (14 lines changed)

@@ -23,21 +23,21 @@ documentation](http://doc.scrapy.org/en/latest/index.html).
 ### Installing

-If you're installing Fourmi, please take a look at our [installation guide](...)
-on our wiki. When you've installed the application, make sure to check our
-[usage guide](...).
+If you're installing Fourmi, please take a look at our installation guides
+on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our
+usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI).

 ### Using the Source

 To use the Fourmi source code multiple dependencies are required. Take a look at
-the [wiki page](...) on using the application source code for a step by step
+our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code for a step by step
 installation guide.

 When developing for the Fourmi project keep in mind that code readability is a
 must. To maintain the readability, code should be conform with the
 [PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
 code. More information about the different structures and principles of the
-Fourmi application can be found on our [wiki](...).
+Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki).

 ### To Do
@@ -45,13 +45,9 @@ The Fourmi project has the following goals for the nearby future:

 __Main goals:__

-- Improve our documentation and guides. (Assignee: Dekker)
 - Build an graphical user interface(GUI) as alternative for the command line
 interface(CLI). (Assignee: Harmen)
 - Compiling the source into an windows executable. (Assignee: Bas)
-- Create an configuration file to hold logins and API keys.
-- Determine reliability of our data point.
-- Create an module to gather data from NIST. (Assignee: Rob)
 - Create an module to gather data from PubChem. (Assignee: Nout)

 __Side goals:__
SIGNED.md (new file, +101)

@@ -0,0 +1,101 @@
##### Signed by https://keybase.io/jdekker
```
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.11 (GNU/Linux)

iQIcBAABAgAGBQJTn3GgAAoJEJrQ9RIUCT6/CI4P/RSAQrd6JugGZoQu/gNdW6eB
MYCybqYGZiieVhUaGOnFNVlp68YpXH+sP/Uc6hXEX30UQEsDmhMeT5NA7ZMS+zJ9
MNHGQdJq22lGb3+VoVBV4RTMdkQXOXvx6p5biskjIEtM3tfTxP529GvAX2TFUNnt
gGWk28EDr30M95XwDxwWo+57Xv8VtSb3VSvXEbrdwGYf8EoQo9oPtzYQ0YcdupcC
ET8bukYVcwpAjoTnPlEy89TiHHohwmimr2ASXeQ64Ks5wfjzcF7NENCAmaAfR+KI
VLLuGqdWMBx1ewVuAXTCZ0Mga/kBoRUaO0PC13UmL8LhhZY9Z3cwD4UnPU35/RQi
IbLfQcZHf/gEvyMeiTYCsyWpm+/xxn1+EfHol4/Q9VSXzZgRBX05Ik6tqeCvjdgG
4PyHBaJTTm/HfMNdg3mr1mbyjTv5UxglEyPv+Y4NdfoVfepkXsXbzvNSyVffZ3Bw
UaFp7KzIC4Jugdpv63FleiAdDY0+iZ5shH86wD1+HJ0/a87kn5Ao1yESby7J7U+f
poZQYeMFeuC0T5hY/3iYoyvZ68oH918ESESiucSulp5BvfwuqGL2+xo5uJIwGYXE
3IDQC7xbA14JHX86IVJlSHAD33iWyiC+5yjw4/bRRVl37KPsLdHiXH3YIRnF5I2I
ZbM/uDYyJdZbBe4UoCoF
=AMhi
-----END PGP SIGNATURE-----

```

<!-- END SIGNATURES -->

### Begin signed statement

#### Expect

```
size  exec  file                  contents
            ./
375         .gitignore            d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1
464         .travis.yml           3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c
428         Changelog.md          c7791d1914ddca9ff1549d90468a79787a7feafe94cecd756e3d7cbd4bcbc7df
            FourmiCrawler/
0             __init__.py         e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
304           items.py            b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806
2178          pipelines.py        f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49
914           settings.py         0be2eaf8e83e85ed27754c896421180fc80cb5ce44449aa9f1048e465d1a96f2
              sources/
9991            ChemSpider.py     847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d
9898            NIST.py           97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644
4754            PubChem.py        58ed4c92519e385f2768cf8034b006b18f8a21632cb1c5a0849b1a329a8c6ffb
6907            WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97
0               __init__.py       e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
1262            source.py         16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4
3026          spider.py           1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a
1081        LICENSE               36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c
3965        README.md             d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3
3676  x     fourmi.py             2ff89f97fd2a49d08417d9ab6cf08e88944d0c45f54ec84550b530be48676c23
261         scrapy.cfg            624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85
            tests/
1             __init__.py         01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
2837          test_configurator.py 4a0eb6e7121eb09a63ab5cb797570d1a42080c5346c3b8b365da56eefa599e80
1892          test_pipeline.py    387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031
1260          test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869
2113          test_spider.py      300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299
            utils/
0             __init__.py         e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
3552          configurator.py     e2b7e0ee6c1fef4373785dfe5df8ec6950f31ce6a5d9632b69a66ea3d1eaf921
2537          sourceloader.py     f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37
```

#### Ignore

```
/SIGNED.md
```

#### Presets

```
git      # ignore .git and anything as described by .gitignore files
dropbox  # ignore .dropbox-cache and other Dropbox-related files
kb       # ignore anything as described by .kbignore files
```

<!-- summarize version = 0.0.9 -->

### End signed statement

<hr>

#### Notes

With keybase you can sign any directory's contents, whether it's a git repo,
source code distribution, or a personal documents folder. It aims to replace the drudgery of:

1. comparing a zipped file to a detached statement
2. downloading a public key
3. confirming it is in fact the author's by reviewing public statements they've made, using it

All in one simple command:

```bash
keybase dir verify
```

There are lots of options, including assertions for automating your checks.

For more info, check out https://keybase.io/docs/command_line/code_signing
fourmi.py (17 lines changed)

@@ -5,7 +5,7 @@ Fourmi, a web scraper build to search specific information for a given compound
 Usage:
     fourmi search <compound>
     fourmi [options] search <compound>
-    fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
+    fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
     fourmi list
     fourmi [--include=<sourcename> | --exclude=<sourcename>] list
     fourmi -h | --help
@@ -15,7 +15,7 @@ Options:
     --attributes=<regex>              Include only that match these regular expressions split by a comma. [default: .*]
     -h --help                         Show this screen.
     --version                         Show version.
-    --verbose                         Verbose logging output.
+    -v                                Verbose logging output. (Multiple occurrences increase logging level)
     --log=<file>                      Save log to an file.
     -o <file> --output=<file>         Output file [default: results.*format*]
     -f <format> --format=<format>     Output formats (supported: csv, json, jsonlines, xml) [default: csv]
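For readers unfamiliar with the docopt idiom introduced here: putting `[-v | -vv | -vvv]` in the usage pattern makes `-v` a counted flag, so `arguments['-v']` comes back as an integer between 0 and 3 rather than a boolean; that integer is what the new `set_logging` consumes. A small self-contained sketch (`demo` is a stand-in program name, not Fourmi's real docstring):

```python
import docopt

usage = """
Usage:
    demo [-v | -vv | -vvv] search <compound>

Options:
    -v    Verbose logging output. (Multiple occurrences increase logging level)
"""

# docopt counts stacked occurrences of a repeatable flag.
args = docopt.docopt(usage, argv=['-vv', 'search', 'methane'])
print(args['-v'])          # 2
print(args['<compound>'])  # methane
```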
@ -25,8 +25,7 @@ Options:
|
|||||||
|
|
||||||
from twisted.internet import reactor
|
from twisted.internet import reactor
|
||||||
from scrapy.crawler import Crawler
|
from scrapy.crawler import Crawler
|
||||||
from scrapy import log, signals
|
from scrapy import signals, log
|
||||||
from scrapy.utils.project import get_project_settings
|
|
||||||
import docopt
|
import docopt
|
||||||
|
|
||||||
from FourmiCrawler.spider import FourmiSpider
|
from FourmiCrawler.spider import FourmiSpider
|
||||||
@@ -58,15 +57,19 @@ def search(docopt_arguments, source_loader):
     :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
     """
     conf = Configurator()
-    conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
+    conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
     conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
-    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
+    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
+                  source_loader, docopt_arguments["--attributes"].split(','))
+    if conf.scrapy_settings.getbool("LOG_ENABLED"):
+        log.start(conf.scrapy_settings.get("LOG_FILE"),
+                  conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
     reactor.run()


 # The start for the Fourmi Command Line interface.
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.3')
     loader = SourceLoader()

     if arguments["--include"]:
@@ -1,7 +1,8 @@
 import unittest
+import ConfigParser
+
 from utils.configurator import Configurator
-
-import ConfigParser


 class TestConfigurator(unittest.TestCase):
@@ -21,11 +22,28 @@ class TestConfigurator(unittest.TestCase):
         self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")

-    # def test_start_log(self):
-    #     self.conf.start_log("test.log", True)
-    #     self.conf.start_log("test.log", False)
-    #     self.conf.start_log(None, True)
-    #     self.conf.start_log(None, False)
+    def test_start_log(self):
+        for i in range(0, 3):
+            self.conf.set_logging("TEST", i)
+            self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST")
+            if i > 0:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True)
+                if i > 1:
+                    self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False)
+                else:
+                    self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
+            else:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False)
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True)
+            if i == 1:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING")
+            elif i == 2:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO")
+            elif i == 3:
+                self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG")
+
+        self.conf.set_logging(verbose=i)
+        self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None)

     def test_read_sourceconfiguration(self):
         config = self.conf.read_sourceconfiguration()
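One detail worth flagging in the new test: `range(0, 3)` stops before 3, so the `elif i == 3` branch, and with it the DEBUG-level assertion, is never reached as written. A two-line check (not part of the commit):

```python
# range(0, 3) yields 0, 1 and 2 and stops before 3, so the loop above
# never exercises the `elif i == 3` branch.
print(list(range(0, 3)))  # [0, 1, 2]
```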
@@ -13,6 +13,7 @@ class TestPipelines(unittest.TestCase):
     def test_none_pipeline(self):
         # Testing the pipeline that replaces the None values in items.
         self.testItem["value"] = "abc"
+        self.testItem["source"] = None
         pipe = pipelines.RemoveNonePipeline()
         processed = pipe.process_item(self.testItem, spider.FourmiSpider())

@@ -47,7 +47,6 @@ class TestFoumiSpider(unittest.TestCase):
         self.assertGreater(len(requests), 0)
         self.assertIsInstance(requests[0], Request)

-
     def test_synonym_requests(self):
         # A test for the synonym request function
         self.spi._sources = []
@@ -1,7 +1,8 @@
-from scrapy import log
-from scrapy.utils.project import get_project_settings
 import ConfigParser

+from scrapy.utils.project import get_project_settings


 class Configurator:
     """
     A helper class in the fourmi class. This class is used to process the settings as set
@@ -11,7 +12,6 @@ class Configurator:
     def __init__(self):
         self.scrapy_settings = get_project_settings()

-
     def set_output(self, filename, fileformat):
         """
         This function manipulates the Scrapy output file settings that normally would be set in the settings file.
@@ -30,23 +30,34 @@ class Configurator:
         if fileformat is not None:
             self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat

-    def start_log(self, logfile, verbose):
+    def set_logging(self, logfile=None, verbose=0):
         """
-        This function starts the logging functionality of Scrapy using the settings given by the CLI.
+        This function changes the default settings of Scrapy's logging functionality
+        using the settings given by the CLI.
         :param logfile: The location where the logfile will be saved.
-        :param verbose: A boolean value to switch between loglevels.
+        :param verbose: An integer value to switch between loglevels.
         """
+        if verbose != 0:
+            self.scrapy_settings.overrides["LOG_ENABLED"] = True
+        else:
+            self.scrapy_settings.overrides["LOG_ENABLED"] = False
+
+        if verbose == 1:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING"
+        elif verbose == 2:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO"
+        else:
+            self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG"
+
+        if verbose > 1:
+            self.scrapy_settings.overrides["LOG_STDOUT"] = False
+        else:
+            self.scrapy_settings.overrides["LOG_STDOUT"] = True
+
         if logfile is not None:
-            if verbose:
-                log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
-            else:
-                log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
+            self.scrapy_settings.overrides["LOG_FILE"] = logfile
         else:
-            if verbose:
-                log.start(logstdout=False, loglevel=log.DEBUG)
-            else:
-                log.start(logstdout=True, loglevel=log.WARNING)
+            self.scrapy_settings.overrides["LOG_FILE"] = None

     @staticmethod
     def read_sourceconfiguration():
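To summarize the mapping the new `set_logging` implements: the counted `-v` value drives three Scrapy settings, and logfile handling reduces to a single override, with `log.start` now invoked once in fourmi.py instead. The sketch below is a restatement of the method above as a pure function, useful for seeing the whole table at once (it is not code from the commit):

```python
def logging_overrides(verbose=0, logfile=None):
    # Restatement of Configurator.set_logging as a pure function.
    return {
        'LOG_ENABLED': verbose != 0,
        # verbose=0 falls through to DEBUG here, but LOG_ENABLED is False then.
        'LOG_LEVEL': {1: 'WARNING', 2: 'INFO'}.get(verbose, 'DEBUG'),
        'LOG_STDOUT': verbose <= 1,  # -vv and up stop echoing to stdout
        'LOG_FILE': logfile,
    }

print(logging_overrides())                 # logging disabled
print(logging_overrides(2, 'fourmi.log'))  # enabled, INFO, logfile only
```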
@@ -75,7 +86,6 @@ class Configurator:
         elif config.defaults():
             section = config.defaults()
         if 'reliability' not in section:
-            log.msg('Reliability not set for %s' % sourcename,
-                    level=log.WARNING)
+            print 'WARNING: Reliability not set for %s' % sourcename
             section['reliability'] = ''
         return section
@@ -5,6 +5,7 @@ import re
 from FourmiCrawler.sources.source import Source
 from utils.configurator import Configurator
+


 class SourceLoader:
     sources = []
