zoukankan      html  css  js  c++  java
  • 爬取多页数据

    '''
    @author:zl
    @contact:
    @site: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,2.html
    '''
    # _*_ coding:utf-8 _*_
    import requests
    from bs4 import BeautifulSoup
    import re
    import time
    from pymongo import MongoClient
    import xlwt
    # Browser-like request headers sent with every search request.
    headers = {
        'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        # "br" is deliberately not advertised: requests/urllib3 can only
        # decode Brotli when an optional brotli package is installed, and
        # without it the body would arrive still compressed and fail to
        # decode as GBK text.
        'accept-encoding': "gzip, deflate",
        'accept-language': "zh-CN,zh;q=0.9",
        'cache-control': "max-age=0",
        'upgrade-insecure-requests': "1",
        'Connection': 'keep-alive',
        'Host': "search.51job.com",
    }
    # Fetch the page source
    def get_content(page):
        """Download one page of the 51job search results for "python".

        Args:
            page: Page number; it is converted with ``str`` and spliced
                into the search URL.

        Returns:
            The response body decoded from GBK, as a ``str``.

        Raises:
            requests.RequestException: On connection errors, timeouts, or
                a 4xx/5xx HTTP status.
        """
        url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,'+str(page)+'.html'
        # Without a timeout, requests waits indefinitely on a stalled server.
        req = requests.get(url, headers=headers, timeout=10)
        # Fail loudly on an error response instead of handing an error page
        # to the parser, which would silently yield zero rows.
        req.raise_for_status()
        # errors='replace' keeps a single malformed byte from aborting the
        # whole crawl with UnicodeDecodeError.
        html = req.content.decode('gbk', errors='replace')
        return html
    # Extract the fields
    def get(html):
        """Pull the job records out of a search-results page.

        Args:
            html: Page source as a ``str``.

        Returns:
            A list of 5-tuples of strings, one per match, holding the five
            capture groups in order: the first link title, the link title
            inside the ``t2`` span, and the text of the ``t3``, ``t4`` and
            ``t5`` spans. The list is empty when nothing matches.
        """
        # Adjacent raw literals concatenate into one pattern; re.S lets
        # ".*?" span the line breaks between the tags.
        pattern = re.compile(
            r'<p class="t1 ">.*?<a target="_blank" title="(.*?)" .*?'
            r'<span class="t2"><a target="_blank" title="(.*?)" .*?'
            r'<span class="t3">(.*?)</span>.*?'
            r'<span class="t4">(.*?)</span>.*?'
            r'<span class="t5">(.*?)</span>',
            re.S,
        )
        return pattern.findall(html)
    # Write the scraped records into the Excel sheet
    def excel_write(items,index):
        """Print each scraped record and write it to the worksheet ``ws``.

        ``ws`` is not a parameter; it is looked up as a module-level name
        when the function runs, so it must exist before the first call.

        Args:
            items: Iterable of indexable records; elements 0 through 4 of
                each record are printed and written to columns 0 through 4.
            index: Sheet row for the first record; each following record
                goes one row further down.
        """
        for row, record in enumerate(items, start=index):
            for col in range(5):
                cell = record[col]
                print(cell)
                ws.write(row, col, cell)  # row, column, value
    if __name__ == '__main__':
        newTable = "test.xls"  # output workbook file name
        wb = xlwt.Workbook(encoding='utf-8')  # create the workbook, declaring its encoding
        # excel_write() reads this sheet through the module-level name ``ws``.
        ws = wb.add_sheet('sheet1')
        # Header row: position, company, location, salary, date.
        headData = ['招聘职位', '公司', '地址', '薪资', '日期']
        bold = xlwt.easyxf('font: bold on')
        for colnum, title in enumerate(headData):
            ws.write(0, colnum, title, bold)
        # Crawl result pages 1-9. The next free row advances by the number
        # of records actually found, rather than assuming 50 per page: a
        # shorter page would leave blank rows, and a longer one would make
        # xlwt raise when the following page overwrote its cells.
        index = 1  # row 0 holds the header
        try:
            for each in range(1,10):
                items = get(get_content(each))
                excel_write(items, index)
                index += len(items)
        finally:
            # Save even if a later page fails, so rows already collected
            # are not lost; the original exception still propagates.
            wb.save(newTable)
  • 相关阅读:
    docker镜像制作及上传到远端镜像仓库
    mysql索引进阶
    电子商务需要用到香港服务器吗?
    golang module goland 配置代理
    nginx做linux服务时,日志有权限提示没权限(nginx: [emerg] open() "/home/www/log/error.log" failed)
    Yaml 、Json 、Dict 之间的转化
    CodeSmith .NET三层架构模板
    C#获取26个英文字母
    基于PCASClass.js和layui.js的城市三级联动
    MySQL变量的使用
  • 原文地址:https://www.cnblogs.com/zhanglin123/p/9203132.html
Copyright © 2011-2022 走看看