zoukankan      html  css  js  c++  java
  • 爬取多页数据

    '''
    @author:zl
    @contact:
    @site: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,2.html
    '''
    # _*_ coding:utf-8 _*_
    import requests
    from bs4 import BeautifulSoup
    import re
    import time
    from pymongo import MongoClient
    import xlwt
    # Browser-like request headers sent with every search-page request.
    headers = {
        'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36" ,
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        # Advertise only the encodings requests always decodes on its own.
        # "br" (Brotli) is decoded only when an optional Brotli package is
        # installed; otherwise the body would arrive still compressed and the
        # GBK decode in get_content would fail or produce garbage.
        'accept-encoding': "gzip, deflate",
        'accept-language': "zh-CN,zh;q=0.9",
        'cache-control': "max-age=0",
        'upgrade-insecure-requests': "1",
        'Connection': 'keep-alive',
        'Host': "search.51job.com",
    }
    # 获取源码
    def get_content(page):
        """Download one page of the 51job "python" search results.

        Args:
            page: 1-based page number inserted into the search URL.

        Returns:
            The page HTML as text, decoded as GBK.

        Raises:
            requests.RequestException: on connection failure, timeout, or a
                4xx/5xx HTTP status.
        """
        url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{}.html'.format(page)
        # Without a timeout requests waits indefinitely on a stalled connection.
        req = requests.get(url, headers=headers, timeout=10)
        # Fail loudly on error pages instead of parsing them as (empty) results.
        req.raise_for_status()
        # The body is decoded as GBK; 'replace' keeps one stray byte from
        # aborting the whole scrape with UnicodeDecodeError.
        return req.content.decode('gbk', 'replace')
    # 获取字段
    def get(html):
        """Extract the job rows from a search-result page.

        Args:
            html: HTML text of one result page.

        Returns:
            A list of 5-tuples of strings, one per matched row, holding the
            five captured groups in order: the two ``title`` attributes
            (``t1`` and ``t2`` links) followed by the ``t3``, ``t4`` and
            ``t5`` span contents.
        """
        # One adjacent-literal piece per captured field; re.S lets ".*?"
        # span the line breaks between the tags.
        pattern = re.compile(
            r'<p class="t1 ">.*?<a target="_blank" title="(.*?)" .*?'
            r'<span class="t2"><a target="_blank" title="(.*?)" .*?'
            r'<span class="t3">(.*?)</span>.*?'
            r'<span class="t4">(.*?)</span>.*?'
            r'<span class="t5">(.*?)</span>',
            re.S,
        )
        return pattern.findall(html)
    # 爬到的内容写入excel
    def excel_write(items, index, sheet=None):
        """Write scraped rows into a worksheet, one item per row.

        Each field is also printed to stdout as it is written.

        Args:
            items: Iterable of row sequences; the first five fields of each
                are written to columns 0-4.
            index: Row number at which the first item is written.
            sheet: Object with a ``write(row, col, value)`` method. Defaults
                to the module-level ``ws`` worksheet created by the script
                entry point, which keeps existing two-argument calls working.

        Returns:
            The row number following the last row written (``index`` itself
            when ``items`` is empty), so callers can continue from there.
        """
        if sheet is None:
            # Fall back to the worksheet the script entry point creates.
            sheet = ws
        for item in items:
            for col, value in enumerate(item[:5]):
                print(value)
                sheet.write(index, col, value)  # row, column, data
            index += 1
        return index
    if __name__ == '__main__':
        # Scrape result pages 1-9 and store every matched row in test.xls.
        newTable = "test.xls"  # output workbook file name
        wb = xlwt.Workbook(encoding='utf-8')  # create the workbook, declaring its encoding
        # Must stay named `ws`: excel_write reads the worksheet from this global.
        ws = wb.add_sheet('sheet1')
        headData = ['招聘职位', '公司', '地址', '薪资', '日期']  # header row titles
        bold = xlwt.easyxf('font: bold on')  # build the header style once, not per cell
        for colnum, title in enumerate(headData):
            ws.write(0, colnum, title, bold)
        # Advance the row cursor by the number of rows actually found on each
        # page. Assuming a fixed 50 per page leaves blank gaps after a short
        # page, and collides with the next page's rows after a long one,
        # which xlwt rejects by default.
        index = 1  # row 0 holds the header
        for each in range(1, 10):
            items = get(get_content(each))
            excel_write(items, index)
            index += len(items)
        wb.save(newTable)
  • 相关阅读:
    Python学习笔记5
    Python字符串的encode与decode
    python代码`if not x:` 和`if x is not None:`和`if not x is None:`
    关于sys.argv
    Python学习笔记4
    Python学习笔记3
    Python学习笔记2
    生产者消费者_测试
    进程管理
    软件包管理
  • 原文地址:https://www.cnblogs.com/zhanglin123/p/9203132.html
Copyright © 2011-2022 走看看