zoukankan      html  css  js  c++  java
  • python3读取HDA零售企业数据(一)

    #-*- coding:utf-8 -*-
    # 下载河南FDA各药品经营企业目录
    
    import urllib.request
    import urllib.parse
    import re
    import os
    import http.cookiejar
    
    # Request headers sent with the listing-page requests; the User-Agent
    # string is the one Internet Explorer 11 sends.
    header = {
        'Connection': 'Keep-Alive',
        'Accept': 'application/x-ms-application, image/jpeg, application/xaml+xml, image/gif, image/pjpeg, application/x-ms-xbap, */*',
        # urllib never decompresses a response body, and this script decodes
        # the raw bytes directly, so do not invite gzip/deflate replies.
        'Accept-Encoding': 'identity',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        #'Referer':'http://hda.gov.cn/interplugin/face2/base.jsp',
    }
     
    
    def getOpener():
        """Build a cookie-aware opener, install it as the default, and return it.

        Returns:
            The ``OpenerDirector`` created by ``urllib.request.build_opener``
            and registered with ``urllib.request.install_opener``.
        """
        print('正在设置cookie')
        # A fresh in-memory jar: cookies set by the server are replayed on
        # later requests made through this opener.
        cookie_handler = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
        handlers = (cookie_handler, urllib.request.HTTPHandler)
        cookie_opener = urllib.request.build_opener(*handlers)
        # Registering it globally makes plain urlopen() calls use it as well.
        urllib.request.install_opener(cookie_opener)
        print('设置cookie成功')
        return cookie_opener
    
    def download(content,pattern):
        """Append one line per company linked from a listing page to ``thefile.txt``.

        ``pattern`` is applied to ``content`` with ``re.findall``; the first
        capture group of every match is taken as the record id.  For each
        distinct id the detail page is fetched once, and the company name,
        registered address and licence number found on it are appended to
        ``thefile.txt`` as a single comma-separated line.

        Args:
            content: decoded HTML text of a listing page.
            pattern: regular expression whose first group captures the id.

        Returns:
            None.  The index of the last id processed is printed as ``i=``.

        Raises:
            urllib.error.URLError: if a detail page cannot be fetched.
            UnicodeDecodeError: if a detail page is not valid UTF-8.
        """
        detail_url = "http://www.hda.gov.cn/interplugin/face2/content.jsp?tableId=13&tableName=TABLE13&tableView=%E8%8D%AF%E5%93%81%E9%9B%B6%E5%94%AE%E4%BC%81%E4%B8%9A&Id="
        # Each value sits on the line after its label; "83%>" is presumably
        # the tail of the value cell's width attribute (TODO confirm against
        # the live page).  Raw strings keep the \n escape intact.
        field_patterns = [
            re.compile(label + r'.*\n.*83%>(.*)</td>')
            for label in ('企业名称', '注册地址', '许可证号')
        ]

        # findall yields tuples when the pattern has several groups and plain
        # strings otherwise; either way the id is the first captured value.
        matches = re.findall(pattern, content)
        subids = [m[0] if isinstance(m, tuple) else m for m in matches]
        # A listing page can link the same record more than once; keep the
        # first occurrence of each id so it is written only once.
        subids = list(dict.fromkeys(subids))

        i = 0
        # Open the output once per page; the with-block guarantees it is
        # closed even when a request fails part-way through.
        with open('thefile.txt', 'a', encoding='utf-8') as file_object:
            for i, subid in enumerate(subids):
                # One request per record; all three fields are read from the
                # same response.
                with urllib.request.urlopen(detail_url + subid) as page:
                    detail = page.read().decode('UTF-8')
                fields = [p.findall(detail) for p in field_patterns]
                print(*fields)
                if not all(fields):
                    # A field is missing on this detail page: skip the record
                    # instead of aborting the whole crawl with IndexError.
                    print('跳过(详情页缺少字段) Id=', subid)
                    continue
                file_object.write(','.join(found[0] for found in fields) + '\n')

        print('i=', i)
    
    
    #opener = getOpener()
    
    def getContent(url,pat,charSet):
        """Fetch ``url`` and return every match of ``pat`` in the decoded body.

        Args:
            url: address to fetch; passed unchanged to ``urllib.request.urlopen``.
            pat: regular expression applied to the page with ``re.findall``.
            charSet: codec name used to decode the response bytes.

        Returns:
            The list produced by ``re.findall``; empty when nothing matches.
        """
        # The with-block closes the response even if reading or decoding fails.
        with urllib.request.urlopen(url) as page:
            content = page.read().decode(charSet)
        return re.findall(pat, content)
    
    if __name__ == '__main__':
        # Start from an empty output file; download() appends to it.
        with open('thefile.txt','w'):
            pass

        # 1. Read the records listed on the first page.
        url = "http://hda.gov.cn/interplugin/face2/base.jsp?tableId=13&tableName=TABLE13&title=%D2%A9%C6%B7%C1%E3%CA%DB%C6%F3%D2%B5&bcId=137264323448453682513826398962"
        request = urllib.request.Request(url, headers=header)
        with urllib.request.urlopen(request) as page:
            pageContent = page.read().decode('gb2312')
        # Raw strings: \d matches a digit and \) a literal parenthesis.
        # NOTE(review): \d{1,4} cannot match an id of five or more digits;
        # confirm the site's id range.
        pattern = r'&Id=(\d{1,4})",null\)>\d{1,6}.(.*?)</a></p>'
        download(pageContent,pattern)

        # 2. Read listing pages 2 through 1183 (inclusive).
        pattern = r"&Id=(\d{1,4})',null\)>\d{1,6}.(.*?)</p>"
        for k in range(2,1184):
            url = 'http://www.hda.gov.cn/interplugin/face2/search.jsp?tableId=13&bcId=137264323448453682513826398962&curstart='+str(k)
            print(url)
            request = urllib.request.Request(url, headers=header)
            with urllib.request.urlopen(request) as page:
                pageContent = page.read().decode('UTF-8')
            download(pageContent,pattern)

        # Printed only when the crawl actually ran, not when imported.
        print('药品经营企业名称下载完成!')
    

      

    
    

      

     经过几天的摸索,终于可以下到想要的数据了;

    路上踩过的几个坑在此记录一下:

    1、正则表达式中的换行符:"." 默认不匹配换行,所以用 (.*) 匹配跨行内容时,要在换行的位置显式加入 '\n';

    2、调试时充分利用 Fiddler 和 Python Shell(方便粘贴)工具,即时调试;

    未解决的问题:爬取的第一个页面中有重复数据,暂未找到如何处理;

  • 相关阅读:
    Redis基础-基本数据类型
    C#特性
    C#反射
    Json序列化
    动态添加文本框并获取文本框的值
    iframe中镶嵌html页,并获取html页中的方法
    获取url中的参数
    发送邮件
    数据导入Excel表格
    处理xml模块、configparser模块、hashlib模块、subprocess模块
  • 原文地址:https://www.cnblogs.com/lrzy/p/6096386.html
Copyright © 2011-2022 走看看