Added a check that skips compounds already visited (e.g. reached again via a redirect)
This commit is contained in:
parent
cb299df96f
commit
6dd03c293a
@ -13,6 +13,7 @@ class WikipediaParser(Parser):
|
|||||||
|
|
||||||
website = "http://en.wikipedia.org/wiki/*"
|
website = "http://en.wikipedia.org/wiki/*"
|
||||||
__spider = None
|
__spider = None
|
||||||
|
searched_compounds = []
|
||||||
|
|
||||||
#def __init__(self, csid):
|
#def __init__(self, csid):
|
||||||
# self.website = "http://en.wikipedia.org/wiki/{id}".format(id=csid)
|
# self.website = "http://en.wikipedia.org/wiki/{id}".format(id=csid)
|
||||||
@ -21,11 +22,17 @@ class WikipediaParser(Parser):
|
|||||||
print response.url
|
print response.url
|
||||||
log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
|
log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
|
||||||
sel = Selector(response)
|
sel = Selector(response)
|
||||||
items = self.parse_infobox(sel)
|
compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]
|
||||||
return items
|
if compound in self.searched_compounds:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
items = self.parse_infobox(sel)
|
||||||
|
self.searched_compounds.append(compound)
|
||||||
|
return items
|
||||||
|
|
||||||
def parse_infobox(self, sel):
|
def parse_infobox(self, sel):
|
||||||
items=[]
|
items=[]
|
||||||
|
|
||||||
tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]').xpath('normalize-space(string())')
|
tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]').xpath('normalize-space(string())')
|
||||||
prop_names = tr_list[::2]
|
prop_names = tr_list[::2]
|
||||||
prop_values = tr_list[1::2]
|
prop_values = tr_list[1::2]
|
||||||
|
Reference in New Issue
Block a user