# -*- coding: utf-8 -*-
import scrapy

from tubatu.items import TubatuItem


class TubatuzxSpider(scrapy.Spider):
    """Crawl the paginated company list on fs.to8to.com.

    Listing pages are requested one after another, starting at
    ``list_1.html`` and stopping at ``list_<max_page>.html``.  Each ``<li>``
    under the ``company-data-list`` element yields one ``TubatuItem`` with
    the fields ``name`` and ``dianhua``.
    """

    name = 'tubatuzx'
    # URL prefix of a listing page; the page number and '.html' are appended.
    url = 'http://fs.to8to.com/company/list_'
    # Number of the most recently scheduled listing page.
    yeshu = 1
    # Last listing page to request (previously a literal 4 inside parse()).
    max_page = 4
    start_urls = [url + str(yeshu) + '.html']
    # Example page URL: http://fs.to8to.com/company/list_4.html

    def parse(self, response):
        """Yield one item per listed company, then request the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            ``TubatuItem`` objects with ``name`` (whitespace-stripped) and
            ``dianhua`` (first text node of the second ``<p>``, or a single
            space when that node is absent), followed by at most one
            ``scrapy.Request`` for the next listing page.
        """
        for sj in response.xpath('//ul[@class="company-data-list"]/li'):
            name = sj.xpath('./a/div[2]/p[1]/span/text()').extract_first()
            if name is None:
                # Indexing extract()[0] used to raise IndexError here, which
                # aborted the rest of the page *and* the pagination request.
                # Skip the malformed entry instead.
                self.logger.debug(
                    'Skipping list entry without a name on %s', response.url
                )
                continue

            ss = TubatuItem()
            ss['name'] = name.strip()
            # Single XPath evaluation; ' ' is the placeholder the spider has
            # always emitted when the node is missing.
            ss['dianhua'] = sj.xpath(
                './a/div[2]/p[2]/text()'
            ).extract_first(default=' ')
            yield ss

        # Schedule the following page once per response, after its items.
        if self.yeshu < self.max_page:
            self.yeshu += 1
            url = self.url + str(self.yeshu) + '.html'
            print('>>>>>>>>>>>>>>>' + url + '<<<<<<<<<<')
            yield scrapy.Request(url, callback=self.parse)
# Full source code: https://github.com/mysteriousKiller/tubatu
