  • A crude general-purpose crawler for Party and government websites

    import re
    import requests
    from lxml import etree

    #   A crude self-adapting crawler. Required arguments:
    #       class Pa:            { urlone  : URL of the first search-results page,
    #                              pagenum : number of result pages to crawl }
    #       method getinfo_url:  { signlist  : list of tags, in nesting order, that lead to the article links, e.g. ['div','a'],
    #                              classname : one-to-one with signlist, the "class=" value of each tag, e.g. ['aa', None];
    #                                          None means the tag has no class }
    #       method write_info:   { h_sign : tag list for the title on the detail page, e.g. ['div','h1'],
    #                              h_name : one-to-one with h_sign, the class value of each tag (same idea as above),
    #                              c_sign : tag list for the body content,
    #                              c_name : class values, one-to-one with c_sign }
    #
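    #   Worked examples (using the values from the parameter file further below) of the XPath each pair of lists produces:
    #     signlist=['h3','a'], classname=['res-title', None]                   ->  //h3[@class = "res-title"]/a//@href
    #     h_sign=['div','h1'], h_name=['article oneColumn pub_border', None]   ->  //div[@class = "article oneColumn pub_border"]/h1//text()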
    class Pa:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko Core/1.70.3676.400 QQBrowser/10.4.3505.400'
        }
        def __init__(self,urlone,pagenum):
            self.urlstart = urlone
            self.info_url = []
            self.pagenum = pagenum
            # print('class created!')
        # Collect the detail-page URLs. First argument: tag names (<a>, <div>); second: class names (the 'aa' in class = 'aa').
        def getinfo_url(self,signlist=[],classname=[]):
            # print('getinfo_url method!')
            url_split = self.urlstart.split('&p=1')
            for pageindex in range(1,self.pagenum+1):
                try:
                    print('crawling results page, pageindex:', pageindex)
                    # initial search url
    
                    # print(url_split)
                    url1 = url_split[0] + '&p={}'.format(pageindex) + url_split[1]
    
                    # print('url1:',url1)
    
                    res = requests.get(url1, headers=self.headers)
                    res.encoding='utf-8'
                    # print(res.text)
                    page = etree.HTML(res.text)
                    # URLs of the individual articles
                    xpathstr = ''
                    for index in range(len(signlist)):
                        if classname[index] is None:
                            xpathstr += '/{}' .format(signlist[index])
                        else:
                            xpathstr +=  '/{}[@class = "{}"]'.format(signlist[index], classname[index])
                        # print(index,xpathstr)
                    xpathstr = '/' + xpathstr + '//@href'
    
                    # print('xpathstr:',xpathstr)
    
                    dataurl = page.xpath(xpathstr)
                    # print('extracted dataurl:', dataurl)
                    if dataurl == []:
                        print('empty result page, last page reached, stopping')
                        break
                    dataurl = list(set(dataurl))
                    # print('index, raw dataurl:', index, dataurl)
                    for everurl_index in range(len(dataurl)):
                        # print('everurl_index:', everurl_index)
                        # print('checking:', dataurl[everurl_index])
                        # relative link: the site root is missing, prepend it
                        if dataurl[everurl_index][0] == '/':
                            url_first = re.findall(r"https://(.*?)/", self.urlstart)
                            if url_first == []:
                                url_first = re.findall(r"http://(.*?)/", self.urlstart)
                                dataurl[everurl_index] = 'http://' + url_first[0] + dataurl[everurl_index]
                            else:
                                dataurl[everurl_index] = 'https://' + url_first[0] + dataurl[everurl_index]
    
                    # print('dataurl after processing:', dataurl)
                    self.info_url.extend(dataurl)
                    # print('length of info_url:', len(self.info_url))
                except Exception as e:
                    print('error while collecting urls, pageindex:', pageindex, e)
        # Extract the title and content from each detail page and write them to files. Arguments: title tags, title class names (as above), content tags, content class names.
        def write_info(self,path = './',h_sign=[],h_name=[],c_sign=[],c_name=[]):
            # print('write_info')
            if self.info_url == []:
                print('no urls to read, run "getinfo_url" first')
            else:
                index = 0
                for url_i in self.info_url:
                    try:
                        content = ''
                        index += 1
                        # print('writing item:', index)
                        # print(url_i)
                        res = requests.get(url_i, headers=self.headers)
                        res.encoding = 'utf-8'
                        # print(res.text)
                        page = etree.HTML(res.text)
                        # xpath for the title
                        h_str = '/'
                        for h_index in range(len(h_sign)):
                            if h_name[h_index] is None:
                                h_str += '/{}'.format(h_sign[h_index])
                            else:
                                h_str += '/{}[@class = "{}"]'.format(h_sign[h_index],h_name[h_index])
                        h_str = h_str + '//text()'
    
                        # print('hstr:',h_str)
    
                        h_element = page.xpath(h_str)
    
                        # print('h_el:',h_element)
    
                        headtext = h_element[0]
                        content += h_element[0] + '\n'
                        # print('head:', h_element[0])
                        # xpath for the content
                        c_str = '/'
    
                        for c_index in range(len(c_sign)):
                            if c_name[c_index] is None:
                                c_str += '/{}'.format(c_sign[c_index])
                            else:
                                c_str += '/{}[@class = "{}"]'.format(c_sign[c_index], c_name[c_index])
                        c_str2 = c_str + '//text()'
                        if page.xpath(c_str2) == []:
                            c_str = c_str + '//text()'
                        else:
                            c_str = c_str2
    
                        # print('c_str:',c_str)
    
                        c_element = page.xpath(c_str)
    
                        # print('c_ele:',c_element)
    
                        # print('flag:',c_element == [])
                        for i in c_element:
                            content += '\n'
                            content += '\t' + i
                        # print('content:',content)
                        filename = '{}/{}_{}.txt'.format(path,index,headtext.strip())
                        # print('filename:',filename)
                        with open(filename,'w+',encoding='utf-8') as f:
                            f.write(content)
                        print('written:', index)
                    except Exception as e:
                        print('error while writing item:', index, e)
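
    For reference, here is a minimal standalone sketch (not part of the original class) of the XPath string that getinfo_url assembles from signlist and classname; the example values are the ones used in the parameter file below:

    def build_xpath(signlist, classname, suffix='//@href'):
        # mirrors the loop in getinfo_url: one path step per tag,
        # with an optional [@class = "..."] predicate when a class name is given
        xpathstr = ''
        for sign, cls in zip(signlist, classname):
            if cls is None:
                xpathstr += '/{}'.format(sign)
            else:
                xpathstr += '/{}[@class = "{}"]'.format(sign, cls)
        return '/' + xpathstr + suffix

    # build_xpath(['h3', 'a'], ['res-title', None])
    # -> '//h3[@class = "res-title"]/a//@href'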

    Writing the parameters to a JSON file:

    import json

    twofile = {
                'sum':['pagenum','test_url','test_sign','test_classname','test_h_sign','test_h_name','test_c_sign', 'test_c_name','path'],
                'pagenum':500,
                'test_url':'http://sousuo.gov.cn/s.htm?q=%E4%BF%A1%E6%81%AF%E5%AE%89%E5%85%A8&n=10&p=1&t=govall&timetype=timeqb&mintime=&maxtime=&sort=&sortType=1&nocorrect=',
                'test_sign':['h3','a'],
                'test_classname':['res-title',None],
                'test_h_sign':['div','h1'],
                'test_h_name':['article oneColumn pub_border',None],
                'test_c_sign':['div','p'],
                'test_c_name':['pages_content',None],
                'path' : 'data3'
               }
    # serialize to JSON
    twofile = json.dumps(twofile)
    # write to file
    with open('canshu/twofile.json','w+') as f:
        f.write(twofile)
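
    The 'sum' entry appears to simply list the expected keys; assuming that is its purpose, a small hedged sketch (the helper name check_params is hypothetical) that uses it to sanity-check a parameter file before running:

    import json

    def check_params(filepath):
        # report any key named in the file's own 'sum' list that is missing
        # from the file itself (assumption: that is what 'sum' is meant for)
        with open(filepath, 'r') as f:
            params = json.load(f)
        missing = [k for k in params.get('sum', []) if k not in params]
        if missing:
            print('missing keys:', missing)
        return missing == []

    # check_params('canshu/twofile.json')  ->  True for the file written above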

    Calling the crawler:

    import os
    import json

    # driver: read each parameter file in 'canshu' and run the crawler
    def result():
        files = os.listdir('canshu')
        for file in files:
            print(file)
            filepath = 'canshu/'+file
            with open(filepath,'r') as f:
                canshu = f.read()
            canshu = json.loads(canshu)
            # print(canshu)
            test_url = canshu['test_url']
            # arguments for getinfo_url
            test_sign = canshu['test_sign']
            test_classname = canshu['test_classname']
            # arguments for write_info
            test_h_sign = canshu['test_h_sign']
            test_h_name = canshu['test_h_name']
            test_c_sign = canshu['test_c_sign']
            test_c_name = canshu['test_c_name']
            path = canshu['path']
            pagenum = canshu['pagenum']
            # print('test_sign:',test_sign,type(test_sign))
            # print('test_classname:', test_classname, type(test_classname))
            test1 = Pa(test_url , pagenum)
            test1.getinfo_url(test_sign ,test_classname)
            test1.write_info(path ,test_h_sign ,test_h_name ,test_c_sign ,test_c_name)
    result()
  • Original post: https://www.cnblogs.com/cxhzy/p/10910097.html