I want to collect the URLs of some job postings, so I wrote a Scrapy spider. I want to get every value matched by the XPath `//article/dl/dd/h2/a[@class="job-title"]/@href`, but when I run the spider with the command:
scrapy spider auseek -a addsthreshold=3
the variable "urls", which is supposed to hold those values, is empty. Can someone help me figure out why?
Here is my code:
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.conf import settings
from scrapy.mail import MailSender
from scrapy.xlib.pydispatch import dispatcher
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from scrapy import log
from scrapy import signals
from myProj.items import ADItem
import time

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2


class AuSeekSpider(CrawlSpider):
    """Collect job-ad page addresses from the seek.com.au listing page.

    The spider reads the start page, follows every link matched by the
    job-title XPath, and emits one ``ADItem`` per ad page. It stops by
    raising ``CloseSpider`` once more than ``addressThresh`` ad pages have
    been visited.

    Spider argument (``-a``):
        addsthreshold: number of ad pages to emit before closing.
    """

    name = "auseek"
    # Declared by the original code but never read or written in this class.
    result_address = []
    # Number of ad pages handled so far by parse_ad.
    addressCount = int(0)
    # Maximum number of ad pages to emit; overridden per run in __init__.
    addressThresh = int(0)
    allowed_domains = ["seek.com.au"]
    start_urls = ["http://www.seek.com.au/jobs/in-australia/"]

    def __init__(self, **kwargs):
        """Store the ``addsthreshold`` spider argument as an integer.

        Args:
            **kwargs: spider arguments; ``addsthreshold`` is read here and
                all of them are forwarded to the base class.

        Raises:
            ValueError: if ``addsthreshold`` is not a valid integer string.
        """
        super(AuSeekSpider, self).__init__(**kwargs)
        # Fall back to the class default when the argument is omitted;
        # int(None) would otherwise abort start-up with a TypeError.
        self.addressThresh = int(
            kwargs.get('addsthreshold', self.addressThresh))
        print('init finished...')

    def parse_start_url(self, response):
        """Yield one request per job link found on the start page.

        Args:
            response: the downloaded start-page response.

        Yields:
            Request objects for each job ad, handled by ``parse_ad``.
        """
        print('This is start url function')
        log.msg("AuSeekSpider.parse_start_url called", level=log.INFO)
        selector = Selector(response)
        # extract() returns plain strings holding the href values, so they
        # are joined to the page URL directly (strings have no getAttribute).
        urls = selector.xpath(
            '//article/dl/dd/h2/a[@class="job-title"]/@href').extract()
        print('urls is: %s' % (urls,))
        if not urls:
            # NOTE(review): an empty result means the XPath matched nothing
            # in the HTML that was actually downloaded; presumably the served
            # markup differs from what the browser shows -- confirm by
            # inspecting response.body.
            log.msg("No job links matched on %s" % response.url,
                    level=log.WARNING)
            return
        for postfix in urls:
            print('postfix: %s' % postfix)
            yield Request(urljoin(response.url, postfix),
                          callback=self.parse_ad)

    def parse_ad(self, response):
        """Build an ``ADItem`` for one job-ad page.

        Args:
            response: the downloaded ad-page response.

        Returns:
            The populated ``ADItem``.

        Raises:
            CloseSpider: once the number of handled ad pages exceeds
                ``addressThresh``.
        """
        print('this is parse_ad function')
        log.msg("AuSeekSpider.parse_ad called", level=log.INFO)
        item = ADItem()
        item['name'] = str(self.name)
        item['picNum'] = str(6)
        item['link'] = response.url
        # strftime without a time tuple formats the current local time.
        item['date'] = time.strftime('%Y%m%d')
        self.addressCount += 1
        # The count is compared after incrementing, so exactly addressThresh
        # items are returned before the spider is closed.
        if self.addressCount > self.addressThresh:
            raise CloseSpider('Get enough website address')
        return item

# The problem is:
urls = hxs.xpath('//article/dl/dd/h2/a[@class="job-title"]/@href').extract()
`urls` is empty when I print it out. I can't figure out why it doesn't work or how to correct it. Thanks for your help.