python - How to recursively crawl a whole website using Scrapy
I want to crawl a complete website using Scrapy, but right now it only crawls a single page.
import scrapy
from scrapy.http import HtmlResponse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.exporter import JsonItemExporter


class IzodspiderSpider(scrapy.Spider):
    """Spider for izod.com that extracts the meta description and product details.

    NOTE(review): `rules` has no effect on a plain `scrapy.Spider` — link-following
    rules are only honored by `CrawlSpider`, and a `CrawlSpider` must not name its
    callback `parse` (see the corrected version in the answer below).
    """
    name = 'izodspider'
    allowed_domains = ['izod.com']
    start_urls = ['http://izod.com/']
    # Follow every extracted link and hand the response to parse_item.
    rules = [Rule(SgmlLinkExtractor(), callback='parse_item', follow=True)]

    def parse(self, response):
        """Extract the meta description and the product name/description nodes."""
        hxs = scrapy.Selector(response)
        # <meta name="description" content="..."> of the page
        meta = hxs.xpath('//meta[@name=\'description\']/@content').extract()
        # Product title (h5) and description paragraph inside #product-details
        name = hxs.xpath('//div[@id=\'product-details\']/h5').extract()
        desc = hxs.xpath('//div[@id=\'product-details\']/p').extract()
Is there a way to extract meta tags using Portia?
There is an error in the rule definition, inside the callback.
Since CrawlSpider uses the parse method internally, you have to name your callback parse_items and reference it in the callback instead of parse.
You can find more information about the callback function in the documentation here: http://doc.scrapy.org/en/latest/topics/request-response.html?highlight=callback#topics-request-response-ref-request-callback-arguments
class IzodspiderSpider(CrawlSpider):
    """CrawlSpider for izod.com that follows every link and scrapes each page.

    The callback is named `parse_items` (NOT `parse`): CrawlSpider implements
    `parse` itself to drive the crawling rules, so overriding it would break
    link following. This is the fix for the question's spider above.
    """
    name = "izod"
    depth_limit = 0
    bot_name = 'izod'
    allowed_domains = ['izod.com']
    start_urls = ['http://www.izod.com']
    # allow=('') matches every URL; follow=True keeps crawling recursively.
    rules = (
        Rule(SgmlLinkExtractor(allow=('')), callback='parse_items', follow=True),
    )

    def parse_items(self, response):
        """Extract the meta description and product details from one page."""
        hxs = scrapy.Selector(response)
        # <meta name="description" content="..."> of the page
        meta = hxs.xpath('//meta[@name=\'description\']/@content').extract()
        # Product title (h5) and description paragraph inside #product-details
        name = hxs.xpath('//div[@id=\'product-details\']/h5').extract()
        desc = hxs.xpath('//div[@id=\'product-details\']/p').extract()
Comments
Post a Comment