  • Scraping and saving multi-page job data from 51Job

    Scraping multiple pages of 51Job listings

    01 Import the modules

    import requests
    import chardet
    from bs4 import BeautifulSoup
    import csv
    from openpyxl import Workbook
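
    Apart from csv, these modules are not in the standard library; if any are missing they can be installed with pip (the package names are requests, chardet, beautifulsoup4 and openpyxl).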
    

    02 Define the function getOnePageInfo()

    def getOnePageInfo(url):
        # Request the page
        res=requests.get(url,
                         headers={'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'}
                        )
        # Detect the encoding so the Chinese text in res.text decodes correctly
        res.encoding=chardet.detect(res.content)['encoding']
        # Convert to a BeautifulSoup object
        soup=BeautifulSoup(res.text,'html.parser')

        # The listings are embedded in a <script> block, not in the rendered HTML,
        # so pull the information out of the page source we actually received
        allstring=soup.find_all('script')[-4].string
        # allstring=soup.find_all('script')[-4].text

        # Method 1: split once on '='; the second part is the whole data object
        data=allstring.split('=',1)[-1]

        # Method 2: slice from the first '{' to the end of the string
        index=allstring.find('{')
        data2=allstring[index:]

        # Use eval() to turn the string from method 1 into a dict
        dict_data=eval(data)

        bigdata = []
        for each in dict_data['engine_search_result']:
            oneInfo=[]
            # Job title: job_name
            oneInfo.append(each.get('job_name'))
            # Company name: company_name
            oneInfo.append(each.get('company_name'))
            # Salary: providesalary_text
            oneInfo.append(each.get('providesalary_text'))
            # Location: workarea_text
            oneInfo.append(each.get('workarea_text'))
            # Post date: updatedate
            oneInfo.append(each.get('updatedate'))
            # Company type: companytype_text
            oneInfo.append(each.get('companytype_text'))
            # Extra attributes: attribute_text
            oneInfo.append(str(each.get('attribute_text')))
            # Industry: companyind_text
            oneInfo.append(each.get('companyind_text'))
            # Add this record to bigdata
            bigdata.append(oneInfo)

        return bigdata
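
    One caveat about eval(): it executes the extracted string as Python code, which is fragile and risky if the page ever changes. When the embedded object is valid JSON (usually the case for this kind of script-embedded data, though that is an assumption about the page), json.loads is a safer way to parse it. A minimal sketch, with parseScriptData as a hypothetical helper name:

    import json

    def parseScriptData(allstring):
        # Everything after the first '=' is the data object, as in method 1 above
        raw=allstring.split('=',1)[-1].strip()
        # Drop a trailing ';' in case the script statement ends with one
        raw=raw.rstrip(';')
        try:
            # json.loads only parses data and never executes code
            return json.loads(raw)
        except json.JSONDecodeError:
            # Fall back to eval() if the payload turns out not to be strict JSON
            return eval(raw)

    Inside getOnePageInfo(), dict_data=eval(data) could then be replaced with dict_data=parseScriptData(allstring).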
    

    03 Define the storage class MySave()

    # Helper class for saving a two-dimensional list (a list of rows)
    class MySave():
        def __init__(self):
            pass
        def saveToCsv(self,data,fileName:str,mode='w'):
            with open(fileName,mode=mode,encoding='utf-8',newline='') as f:
                csvfile=csv.writer(f)
                # Write each row of data
                for each in data:
                    csvfile.writerow(each)
                print(fileName,'saved')

        def saveToExcel(self,data,fileName):
            # Create a workbook
            wb=Workbook()
            # Use the active worksheet
            sheet=wb.active
            # Write the rows
            for each in data:
                sheet.append(each)
            wb.save(fileName)
            print(fileName,'saved')
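
    Both methods expect data to be a two-dimensional list: a list of rows, each row itself a list of cell values. A quick usage sketch; the header row, sample values, and the file name demo.csv are hypothetical, with the columns mirroring the field order collected in getOnePageInfo():

    saver=MySave()
    # Hypothetical header row matching the field order in getOnePageInfo()
    header=[['job_name','company_name','providesalary_text','workarea_text',
             'updatedate','companytype_text','attribute_text','companyind_text']]
    # Mode 'w' creates or overwrites the file
    saver.saveToCsv(header,'demo.csv','w')
    # Mode 'a' appends, so later rows land under the header
    saver.saveToCsv([['Data Analyst','ExampleCo','10-15k/month','Shanghai','09-30','Private',"['1-3 yrs']",'Internet']],'demo.csv','a')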
    

    04 Scrape multiple pages of data

    save=MySave()
    import random
    import time
    # Scrape pages 1 through 10
    for i in range(1,11):
        # Pause 1-3 seconds between requests to avoid hammering the server
        time.sleep(random.randint(1,3))
        # Build the URL for page i
        url=f'https://search.51job.com/list/020000%252c080200,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,{i}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
        print(f'Scraping page {i}')
        # Fetch and parse this page
        data=getOnePageInfo(url)
        # Append the rows to the CSV file
        save.saveToCsv(data,'10_pages_data.csv','a')
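
    The long URL hard-codes the city codes and the search keyword 数据分析师 ("data analyst"), each percent-encoded twice, which is why every % shows up as %25. To search for something else, the keyword segment can be rebuilt with urllib.parse.quote applied twice. A sketch under the assumption that the rest of the path keeps the same format as the URL above; buildSearchUrl is a hypothetical helper:

    from urllib.parse import quote

    def buildSearchUrl(keyword,page):
        # Replicate the double percent-encoding seen in the original URL
        kw=quote(quote(keyword))
        return ('https://search.51job.com/list/020000%252c080200,000000,0000,00,9,99,'
                f'{kw},2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99'
                '&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=')

    # Example: page 1 for the original keyword
    print(buildSearchUrl('数据分析师',1))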
    

    Complete code

    import requests
    import chardet
    from bs4 import BeautifulSoup
    import csv
    from openpyxl import Workbook
    
    def getOnePageInfo(url):
        # Request the page
        res=requests.get(url,
                         headers={'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'}
                        )
        # Detect the encoding so the Chinese text in res.text decodes correctly
        res.encoding=chardet.detect(res.content)['encoding']
        # Convert to a BeautifulSoup object
        soup=BeautifulSoup(res.text,'html.parser')

        # The listings are embedded in a <script> block, not in the rendered HTML,
        # so pull the information out of the page source we actually received
        allstring=soup.find_all('script')[-4].string
        # allstring=soup.find_all('script')[-4].text

        # Method 1: split once on '='; the second part is the whole data object
        data=allstring.split('=',1)[-1]

        # Method 2: slice from the first '{' to the end of the string
        index=allstring.find('{')
        data2=allstring[index:]

        # Use eval() to turn the string from method 1 into a dict
        dict_data=eval(data)

        bigdata = []
        for each in dict_data['engine_search_result']:
            oneInfo=[]
            # Job title: job_name
            oneInfo.append(each.get('job_name'))
            # Company name: company_name
            oneInfo.append(each.get('company_name'))
            # Salary: providesalary_text
            oneInfo.append(each.get('providesalary_text'))
            # Location: workarea_text
            oneInfo.append(each.get('workarea_text'))
            # Post date: updatedate
            oneInfo.append(each.get('updatedate'))
            # Company type: companytype_text
            oneInfo.append(each.get('companytype_text'))
            # Extra attributes: attribute_text
            oneInfo.append(str(each.get('attribute_text')))
            # Industry: companyind_text
            oneInfo.append(each.get('companyind_text'))
            # Add this record to bigdata
            bigdata.append(oneInfo)

        return bigdata
        
    # Helper class for saving a two-dimensional list (a list of rows)
    class MySave():
        def __init__(self):
            pass
        def saveToCsv(self,data,fileName:str,mode='w'):
            with open(fileName,mode=mode,encoding='utf-8',newline='') as f:
                csvfile=csv.writer(f)
                # Write each row of data
                for each in data:
                    csvfile.writerow(each)
                print(fileName,'saved')

        def saveToExcel(self,data,fileName):
            # Create a workbook
            wb=Workbook()
            # Use the active worksheet
            sheet=wb.active
            # Write the rows
            for each in data:
                sheet.append(each)
            wb.save(fileName)
            print(fileName,'saved')
            
    save=MySave()
    import random
    import time
    # Scrape pages 1 through 10
    for i in range(1,11):
        # Pause 1-3 seconds between requests to avoid hammering the server
        time.sleep(random.randint(1,3))
        # Build the URL for page i
        url=f'https://search.51job.com/list/020000%252c080200,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,{i}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
        print(f'Scraping page {i}')
        # Fetch and parse this page
        data=getOnePageInfo(url)
        # Append the rows to the CSV file
        save.saveToCsv(data,'10_pages_data.csv','a')
    
  • Original article: https://www.cnblogs.com/James-221/p/13770635.html