土巴兔数据爬取

# -*- coding: utf-8 -*-
import scrapy
from tubatu.items import TubatuItem

class TubatuzxSpider(scrapy.Spider):
    """Crawl the paginated company list on fs.to8to.com.

    Starts at list page ``yeshu`` and follows the list pages one by one up to
    ``max_yeshu``. For every ``<li>`` entry of the ``company-data-list``
    element a ``TubatuItem`` with the fields ``name`` and ``dianhua`` is
    yielded.
    """

    name = 'tubatuzx'
    # List pages are addressed as <url><page number>.html,
    # e.g. http://fs.to8to.com/company/list_4.html
    url = 'http://fs.to8to.com/company/list_'
    # Page number of the list page currently being crawled; parse() advances it.
    yeshu = 1
    # Last list page to request.
    max_yeshu = 4
    start_urls = [url + str(yeshu) + '.html']

    def parse(self, response):
        """Extract one item per company entry, then request the next page.

        Args:
            response: The downloaded list page.

        Yields:
            A ``TubatuItem`` per list entry, followed by a ``scrapy.Request``
            for the next list page while ``yeshu`` is below ``max_yeshu``.
        """
        for sj in response.xpath('//ul[@class="company-data-list"]/li'):
            name = sj.xpath('./a/div[2]/p[1]/span/text()').extract_first()
            if name is None:
                # An entry without a name node must not abort the whole page
                # (and with it the pagination request below), so skip it.
                self.logger.warning(
                    'Skipping list entry without a name on %s', response.url)
                continue

            ss = TubatuItem()
            ss['name'] = name.strip()
            # Entries without the second <p> text get a single-space
            # placeholder so the field is always present.
            ss['dianhua'] = sj.xpath(
                './a/div[2]/p[2]/text()').extract_first(default=' ')
            yield ss

        if self.yeshu < self.max_yeshu:
            self.yeshu += 1
            url = self.url + str(self.yeshu) + '.html'
            print('>>>>>>>>>>>>>>>' + url + '<<<<<<<<<<')
            yield scrapy.Request(url, callback=self.parse)

具体代码:https://github.com/mysteriousKiller/tubatu

Published by

风君子

独自遨游何稽首 揭天掀地慰生平