From add4a13a4db4e5875e66fc8c179ab06c084abe0f Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Sun, 6 Apr 2014 18:02:09 +0200
Subject: [PATCH] Trying to make a start with the WikipediaParser, but I can't find out with the Scrapy website (or another way) what the structure of the file should be, and how I can test/run the crawling on a page.

---
 FourmiCrawler/parsers/WikipediaParser.py | 33 +++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/parsers/WikipediaParser.py b/FourmiCrawler/parsers/WikipediaParser.py
index d88f4f1..b3dc36f 100644
--- a/FourmiCrawler/parsers/WikipediaParser.py
+++ b/FourmiCrawler/parsers/WikipediaParser.py
@@ -1,3 +1,30 @@
-__author__ = 'Bas'
-__author__ = 'Nout'
-#new branch
\ No newline at end of file
+import parser
+from scrapy.selector import Selector
+from FourmiCrawler.items import Result
+
+class WikipediaParser:
+
+    website = "http://en.wikipedia.org/wiki/Methane"
+    __spider = "WikipediaParser"
+
+
+    #def __init__(self, csid):
+    #    self.website = "http://en.wikipedia.org/wiki/{id}".format(id=csid)
+
+    #def parse(self, response):
+        #self.log('A response from %s just arrived!' % response.url)
+    def parse():
+        sel = Selector("http://en.wikipedia.org/wiki/Methane")
+        items = []
+        item = Result()
+        item['attribute']="Melting point"
+        item['value']=site.xpath('//tr[contains(@href, "/wiki/Melting_point")]/text()').extract()
+        item['source']= self.website
+        items.append(item)
+        print item['attribute']
+        print item['value']
+        print item['source']
+        print "test"
+        return items
+
+    parse()
\ No newline at end of file