# -*- coding: utf-8 -*-
"""Read the drug-trading enterprise data published by the Beijing FDA.

The script reads the total record count from the first list page, walks the
paginated list, follows every "查看" link to the enterprise's detail page and
appends one ``name,address,licence`` line per enterprise to ``bjda.txt``.

20161125 zhangshaohua
"""
import re
import urllib.request
import urllib.parse
import os

LIST_URL = 'http://www.bjda.gov.cn/eportal/ui?pageId=331148'
DETAIL_URL = 'http://www.bjda.gov.cn/eportal/ui?pageId=331631&artileId='
PAGE_ENCODING = 'UTF-8'
OUTPUT_FILE = 'bjda.txt'

# The list shows 20 records per page.
PAGE_SIZE = 20
# First list page to crawl. 51 is the value hard-coded in the original script;
# presumably a resume point after an earlier run stopped -- TODO confirm, and
# set to 1 for a full crawl.
START_PAGE = 51

# Raw strings: the regex escapes (\d, \s) must reach the regex engine intact.
TOTAL_PATTERN = r'总记录数:(\d{1,5}),'
# Non-greedy so that two links on one line yield two ids instead of one blob.
ARTICLE_ID_PATTERN = r'artileId=(.*?)">查看'
# "\s*" lets the header cell and the value cell sit on different lines
# ("." does not match a newline).
NAME_PATTERN = r'企业名称:</th>\s*.*?<td>(.*?)</td>'
# Up to three whitespace characters (e.g. a line break) may precede "</td>".
ADDRESS_PATTERN = r'注册地址:</th>\s*.*?<td>(.*?)\s{0,3}</td>'
LICENCE_PATTERN = r'许可证号:</th>\s*.*?<td>(.*?)</td>'


def fetch_page(url, charSet):
    """Download *url* and return its body decoded with *charSet*."""
    # The context manager closes the response even if read/decode fails.
    with urllib.request.urlopen(url) as page:
        return page.read().decode(charSet)


def getContent(url, pat, charSet):
    """Return every match of the regular expression *pat* in the page at *url*.

    Args:
        url: address of the page to download.
        pat: regular expression; the result follows ``re.findall`` rules
            (captured groups when the pattern has any).
        charSet: encoding used to decode the downloaded bytes.

    Returns:
        The list produced by ``re.findall``; empty when nothing matches.
    """
    return re.findall(pat, fetch_page(url, charSet))


def page_count(total_records, page_size=PAGE_SIZE):
    """Return how many pages are needed to show *total_records* records.

    Rounds up, so a partially filled last page is counted (``round`` would
    drop it whenever it is less than half full).
    """
    return (total_records + page_size - 1) // page_size


def extract_fields(html, source=''):
    """Pull (name, registered address, licence number) out of a detail page.

    Args:
        html: decoded HTML of one enterprise detail page.
        source: optional label (e.g. the page URL) used in the error message.

    Returns:
        A tuple of three strings: the first match of each field pattern.

    Raises:
        IndexError: if any of the three fields cannot be found.
    """
    values = []
    for label, pattern in (('企业名称', NAME_PATTERN),
                           ('注册地址', ADDRESS_PATTERN),
                           ('许可证号', LICENCE_PATTERN)):
        found = re.findall(pattern, html)
        if not found:
            raise IndexError(
                'field {0} not found on detail page {1}'.format(label, source))
        values.append(found[0])
    return tuple(values)


def main():
    """Crawl the list pages and append every enterprise record to the file."""
    # Read the first page to learn the total number of records.
    totals = getContent(LIST_URL, TOTAL_PATTERN, PAGE_ENCODING)
    if not totals:
        raise IndexError('total record count not found on ' + LIST_URL)
    last_page = page_count(int(totals[0]))

    # "+ 1" includes the final page; this assumes page numbers start at 1.
    # If they start at 0 the extra page simply yields no links.
    for page_no in range(START_PAGE, last_page + 1):
        list_url = LIST_URL + '&currentPage=' + str(page_no)
        for article_id in getContent(list_url, ARTICLE_ID_PATTERN,
                                     PAGE_ENCODING):
            detail_url = DETAIL_URL + article_id
            # One download per enterprise; all three fields come from it.
            html = fetch_page(detail_url, PAGE_ENCODING)
            name, address, licence = extract_fields(html, detail_url)
            print(name, address, licence)
            # Build the whole line first so a failure never leaves a partial
            # record behind, and reopen per record so everything written so
            # far is on disk if a later page fails.
            # NOTE: the file uses the platform default encoding, as before.
            with open(OUTPUT_FILE, 'a') as file_object:
                file_object.write(','.join((name, address, licence)) + '\n')

    print('药品零售企业读取完成!')


if __name__ == '__main__':
    main()
经历了读取HDA的练习,此次读取BJ的数据开始比较顺畅。在读取996条数据时出错,再次出现换行造成的问题;
多次试错后用‘\s{0,3}’成功解决。
正则表达式要继续学习,才能不断进步,遇到“坑”时才能顺利通过!