cfda数据抓取
1.网站数据是加密的,需要浏览器进行数据解析

2.网址url有js加密

3.PhantomJS无法解析数据, chrome无法获取数据,所以最终选择用Firefox浏览器
import pymysql
import time
import uuid
from lxml import etree
import logging
from selenium import webdriver
import threading
import queue
import re
# Send log records of level INFO and above to 'shengchan.log'; filemode "w"
# truncates the file each time the script starts.
logging.basicConfig(filename='shengchan.log', filemode="w", level=logging.INFO)
class App1Spider(object):
    """Scrape drug-manufacturer records from app1.sfda.gov.cn into MySQL.

    The site is driven through a headless Firefox session: the index page is
    opened, listing pages are reached with the page's own ``devPage`` script,
    each detail link is clicked, and the rendered detail table is parsed with
    lxml and stored in the ``cfda_drug_company20181205`` table.
    """

    INDEX_URL = "http://app1.sfda.gov.cn/datasearch/face3/base.jsp?tableId=34&tableName=TABLE34&title=%D2%A9%C6%B7%C9%FA%B2%FA%C6%F3%D2%B5&bcId=118103348874362715907884020353"
    # Listing pages are visited from the start page up to, but excluding, this one.
    LAST_PAGE_EXCLUSIVE = 520
    # Number of detail links opened on every listing page.
    ROWS_PER_PAGE = 15
    # Detail records are only scraped on pages strictly greater than this value.
    # NOTE(review): looks like a manual resume point - confirm before changing.
    SKIP_DETAILS_UP_TO_PAGE = 511
    TABLE_NAME = "cfda_drug_company20181205"

    # (column name, row index in the detail table, fallback when the cell is
    # missing), listed in the column order of the INSERT statement.
    FIELD_SPECS = (
        ("number", 2, "00000000"),            # manufacturer number
        ("manufactureAddress", 11, ""),       # production address
        ("manufactureRange", 12, ""),         # production scope
        ("certificateDate", 13, "2018-01-01"),  # certificate issue date
        ("validityDate", 14, "2018-01-01"),   # validity date
        ("certificateOrgan", 15, ""),         # issuing authority
        ("Signer", 16, ""),                   # signer
        ("superviseAgency", 17, ""),          # routine supervising agency
        ("superviser", 18, ""),               # routine supervising staff
        ("socialCreditCode", 3, ""),          # social credit / organisation code
        ("reportTel", 19, ""),                # complaint hotline
        ("comment", 20, ""),                  # remarks
        ("classificationCode", 4, ""),        # classification code
        ("province", 5, ""),                  # province
        ("companyName", 6, ""),               # company name
        ("legalPeople", 7, ""),               # legal representative
        ("companyResponsioner", 8, ""),       # person in charge of the company
        ("qualityResponsioner", 9, ""),       # person in charge of quality
        ("registerAddress", 10, ""),          # registered address
    )

    _CELL_XPATH = '//*[@id="content"]/div/div/table[1]/tbody/tr[%d]/td[2]/text()'
    _INSERT_COLUMNS = tuple(spec[0] for spec in FIELD_SPECS) + ("numbers",)
    # Both statements use driver-side placeholders so scraped text is never
    # spliced into the SQL string.
    _SELECT_SQL = "select id from " + TABLE_NAME + " where numbers = %s"
    _INSERT_SQL = "insert into {table}({columns}) values({marks})".format(
        table=TABLE_NAME,
        columns=", ".join(_INSERT_COLUMNS),
        marks=", ".join(["%s"] * len(_INSERT_COLUMNS)),
    )

    def __init__(self):
        """Open the MySQL connection and start a headless Firefox session."""
        # NOTE(review): the connection settings are blank in this file and the
        # port had no value at all; 3306 is only a placeholder - fill in the
        # real host, port, database, user and password before running.
        self.db = pymysql.connect(host='', port=3306, database='', user='',
                                  password='', charset='utf8')
        self.cursor = self.db.cursor()
        try:
            self.options = webdriver.FirefoxOptions()
            self.options.add_argument('--headless')
            # NOTE(review): the arguments below look like Chrome flags;
            # whether Firefox honours them is not verified here.
            # Google's documentation mentions this flag as a bug workaround.
            self.options.add_argument('--disable-gpu')
            # Set the default encoding to UTF-8.
            self.options.add_argument('lang=zh_CN.UTF-8')
            # Hide scrollbars, for some special pages.
            self.options.add_argument('--hide-scrollbars')
            # Do not load images.
            self.options.add_argument('blink-settings=imagesEnabled=false')
            # Fix the browser resolution.
            self.options.add_argument('window-size=1440x900')
            # NOTE(review): newer Selenium releases expect ``options=`` instead
            # of ``firefox_options=`` - confirm against the installed version.
            self.browser = webdriver.Firefox(firefox_options=self.options)
        except Exception:
            # Do not leave the database connection open when the browser
            # cannot be started.
            self.db.close()
            raise

    def close(self):
        """Quit the browser and close the cursor and connection.

        Errors raised while releasing one resource are logged and do not stop
        the remaining resources from being released.
        """
        for release in (self.browser.quit, self.cursor.close, self.db.close):
            try:
                release()
            except Exception:
                logging.exception("!-- error while releasing resources --!")

    def main(self):
        """Entry point: crawl every listing page and store the detail records.

        The index page is reloaded whenever a listing page cannot be reached,
        and crawling resumes one page before the failure. The method returns
        once the last page has been processed; the browser and the database
        connection are released on exit, so the instance cannot be reused.

        :return: None
        """
        start = 1
        try:
            while True:
                browser = self.go_index()
                if not browser:
                    # go_index already paused before reporting the failure.
                    continue
                failed_page = self._crawl_pages(browser, start)
                if failed_page is None:
                    logging.info("!-- all pages processed --!")
                    return
                # Step back one page, but never below the first page.
                start = max(failed_page - 1, 1)
        finally:
            self.close()

    def _crawl_pages(self, browser, start):
        """Visit listing pages from ``start`` on; return the page that failed, or None."""
        for page in range(start, self.LAST_PAGE_EXCLUSIVE):
            browser = self.go_page(browser, page)
            if not browser:
                return page
            if page <= self.SKIP_DETAILS_UP_TO_PAGE:
                continue
            for index in range(self.ROWS_PER_PAGE):
                detail_html = self.go_detail(browser, index)
                if not detail_html:
                    # Skip the rest of this listing page.
                    break
                self.parse_detail(detail_html, self._record_id(page, index))
        return None

    @classmethod
    def _record_id(cls, page, index):
        """Return the 1-based sequence number of entry ``index`` on ``page``."""
        return (page - 1) * cls.ROWS_PER_PAGE + index + 1

    def go_index(self):
        """Open the index page.

        On any failure the method sleeps 30 seconds before returning.

        :return: the browser object, or None when the page could not be
            loaded or does not contain the expected title text
        """
        try:
            self.browser.get(self.INDEX_URL)
            time.sleep(3)
            html = self.browser.page_source
        except Exception:
            logging.exception("!-- error to get index page --!")
            time.sleep(30)
            return None
        if re.search(r"管理局--数据查询", html):
            return self.browser
        logging.warning("!-- index page loaded without the expected title --!")
        time.sleep(30)
        return None

    def go_page(self, browser, page):
        """Jump to the given listing page.

        :param browser: browser object
        :param page: page number to jump to
        :return: the browser object after the jump, or None when the script
            failed or the page source does not show the requested page number
        """
        print("!-- start page %s --!" % page)
        go_page_js = 'location.href="javascript:devPage(%s)";' % page
        try:
            browser.execute_script(go_page_js)
            # Wait for Firefox to finish loading the page.
            time.sleep(2)
            html = browser.page_source
        except Exception:
            print("!-- error to go page %s --!" % page)
            return None
        if re.search(r"第 %s 页" % page, html):
            logging.info("!-- success to go page %s --!" % page)
            return browser
        logging.info("!-- error to go page %s --!" % page)
        return None

    def go_detail(self, browser, number):
        """Open one detail entry, capture its HTML and go back to the list.

        :param browser: browser object showing a listing page
        :param number: zero-based index of the link inside the ``content``
            element to click
        :return: the detail page source, or None when the detail page could
            not be opened or does not contain the ``viewList`` back link
        """
        print("!-- go detail %s --!" % number)
        go_detail_js = (
            "var div=document.getElementById('content');"
            "var c=div.getElementsByTagName('a')[{detail_num}].click();"
        ).format(detail_num=number)
        return_list_js = 'location.href = "javascript:viewList();"'
        try:
            browser.execute_script(go_detail_js)
            time.sleep(2)
            detail_html = browser.page_source
        except Exception:
            # Matches go_index/go_page: a browser error is reported to the
            # caller as None instead of aborting the whole crawl.
            logging.exception("!-- error to get detail --! %s" % number)
            print("!-- error to get detail --! %s" % number)
            return None
        if not re.search(r"javascript:viewList", detail_html):
            print("!-- error to get detail --! %s" % number)
            return None
        try:
            browser.execute_script(return_list_js)
            time.sleep(2)
        except Exception:
            # The detail HTML is already captured, so it is still returned.
            logging.exception("!-- error to return to list --! %s" % number)
        return detail_html

    @staticmethod
    def _clean_text(raw):
        """Strip surrounding whitespace and replace ASCII apostrophes with ‘.

        The substitution is retained from the original implementation so that
        stored values keep the same form.
        """
        return raw.strip().replace("'", "‘")

    @classmethod
    def _cell_xpath(cls, row):
        """Return the XPath of the value cell in table row ``row``."""
        return cls._CELL_XPATH % row

    @classmethod
    def _extract_field(cls, tree, row, default):
        """Return the cleaned text of row ``row``, or ``default`` if absent."""
        nodes = tree.xpath(cls._cell_xpath(row))
        if not nodes:
            return default
        return cls._clean_text(nodes[0])

    def parse_detail(self, detail_html, id):
        """Extract the fields of one detail page and insert them if new.

        A record is inserted only when no row with ``numbers`` equal to
        ``id`` exists yet. A failing INSERT is printed and rolled back;
        it does not raise.

        :param detail_html: HTML source of the detail page
        :param id: sequence number of the record, stored in column ``numbers``
        :return: None
        """
        record_id = int(id)
        tree = etree.HTML(detail_html)
        if tree is None:
            logging.warning("!-- empty detail page for id %s --!" % record_id)
            return
        values = [self._extract_field(tree, row, default)
                  for _, row, default in self.FIELD_SPECS]
        # cursor.execute returns the number of matching rows for the SELECT.
        if self.cursor.execute(self._SELECT_SQL, (record_id,)):
            return
        try:
            self.cursor.execute(self._INSERT_SQL, values + [record_id])
            self.db.commit()
        except Exception as e:
            self.db.rollback()
            print('id:%s e:%s' % (id, e))
if __name__ == '__main__':
    # Script entry point: build the spider (connects to MySQL and starts
    # Firefox) and run the crawl.
    sheng = App1Spider()
    sheng.main()