zoukankan      html  css  js  c++  java
  • [Python爬虫]智联招聘

    import requests
    import re
    import json
    import time
    from openpyxl import workbook
    from openpyxl import load_workbook
    from pymongo import MongoClient
    from requests.exceptions import RequestException
    import csv
    import xlwt
    def get_one_page(url):
        """Fetch one search-result page and return its HTML text.

        Args:
            url: Full URL of the page to download.

        Returns:
            The response body decoded as GBK when the server answers with
            HTTP 200; ``None`` for any other status code or when the request
            fails (connection error, timeout, ...).
        """
        headers={
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
            'Host':'search.51job.com',
            'Cookie':'partner=www_baidu_com; guid=992c8a3fa4140d299ad06533b8965bdd; 51job=cenglish%3D0%26%7C%26; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60040000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAC%2B%2B%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA04%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAC%2B%2B%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA05%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAC%2B%2B%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch3%7E%60030200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAC%2B%2B%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21collapse_expansion%7E%601%7C%21',
        }
        try:
            # Without a timeout requests waits indefinitely on a stalled
            # connection. A timeout raises requests.exceptions.Timeout, a
            # RequestException subclass, so the handler below still applies.
            response = requests.get(url,headers=headers,timeout=10)
        except RequestException:
            return None
        if response.status_code!=200:
            return None
        # The text is decoded as GBK rather than with the encoding requests
        # would pick on its own.
        response.encoding='gbk'
        return response.text
    
    def parse_one_page(html):
        """Extract job postings from a search-result page.

        Args:
            html: HTML text of one result page, or ``None``/empty when the
                download failed.

        Returns:
            A list of 5-tuples of strings, one per posting, in the order
            (job title, link found in the ``t2`` span, text of that link,
            ``t3`` span text, ``t4`` span text). The script's spreadsheet
            header labels these name, web, company, location and salary.
            An empty list is returned when ``html`` is ``None`` or empty.
        """
        if not html:
            return []
        # ``[\s\S]*?`` lazily skips any characters, including newlines,
        # between the fields. The backslashes are essential: without them the
        # class only matches the literal characters '.', 's' and 'S'.
        pattern=re.compile(r'<a target="_blank" title="(.*?)"[\s\S]*?<span class="t2"><a target="_blank" title=.*?href="(.*?)">'
                           r'(.*?)</a></span>[\s\S]*?<span class="t3">(.*?)</span>[\s\S]*?<span class="t4">(.*?)</span>',re.S)
        return pattern.findall(html)
    
    def write_to_file(content):
        """Append ``content`` to ``jobs.txt`` as one JSON document per line.

        Args:
            content: Any JSON-serialisable object. Non-ASCII characters are
                written as-is (``ensure_ascii=False``) in UTF-8.
        """
        # The line terminator must be the escape sequence '\n'; a literal line
        # break inside the quotes is a syntax error.
        with open('jobs.txt','a',encoding='utf-8') as f:
            f.write(json.dumps(content,ensure_ascii=False)+'\n')
        # No explicit close(): the ``with`` block closes the file on exit.
    # Module-level MongoDB handles, created at import time.
    # MongoClient() is constructed with no arguments, so it uses pymongo's
    # default connection settings.
    client=MongoClient()
    # Database named 'jobs'.
    db=client['jobs']
    # Collection named 'jobs' inside that database; used by save_to_mongo().
    # NOTE(review): this name shadows the stdlib ``collections`` module should
    # that module ever be imported in this file.
    collections=db['jobs']
    
    def save_to_mongo(result):
        """Insert ``result`` into the module-level ``collections`` collection.

        ``Collection.insert`` was deprecated in PyMongo 3 and removed in
        PyMongo 4, so the explicit ``insert_one``/``insert_many`` calls are
        used instead.

        Args:
            result: A single document (dict), or a list/tuple of documents,
                which the removed ``insert`` method also accepted.
        """
        if isinstance(result,(list,tuple)):
            outcome=collections.insert_many(result)
        else:
            outcome=collections.insert_one(result)
        # Failures surface as exceptions from pymongo; reaching this point
        # means the call returned a result object.
        if outcome:
            print('Saved to Mongo')
    
    def save_to_csv(result):
        """Append one job record to ``data.csv``.

        The header row is written only when the file is still empty, so
        repeated calls produce a single header followed by data rows instead
        of a header before every record.

        Args:
            result: Mapping with the keys 'name', 'web', 'company',
                'location' and 'salary'. Missing keys are written as empty
                fields; unknown keys raise ``ValueError`` (``csv.DictWriter``
                defaults).
        """
        fieldnames=['name','web','company','location','salary']
        # newline='' lets the csv module control line endings; without it
        # every row is followed by a blank line on Windows.
        with open('data.csv','a',encoding='utf-8',newline='') as csvfile:
            writer=csv.DictWriter(csvfile,fieldnames=fieldnames)
            # In append mode the stream starts at end-of-file, so position 0
            # means the file is new or empty and still needs its header.
            if csvfile.tell()==0:
                writer.writeheader()
            writer.writerow(result)
    def save_to_excel(result):
        """Append ``result`` as one row to the module-level worksheet ``ws``.

        ``ws`` is created by the script entry point before this is called.
        """
        # ``ws`` is only read here, never rebound, so the module-level name
        # resolves without a ``global`` declaration.
        ws.append(result)
    def main(page):
        """Scrape one 51job search-result page and append its rows to Excel.

        Args:
            page: Result page number; it is inserted into the URL just before
                ``.html``.

        A page that cannot be downloaded is reported and skipped rather than
        being handed to the parser as ``None``.
        """
        # Alternative search URLs kept for reference. The keyword in the first
        # one decodes to "C++开发工程师"; the second is a longer form of the
        # active query below.
        #url='https://search.51job.com/list/040000,000000,0000,00,9,99,C%252B%252B%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,'+str(page)+'.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
        #url='https://search.51job.com/list/010000,000000,0000,00,9,99,%25E5%259F%25B9%25E8%25AE%25AD%25E6%259C%25BA%25E6%259E%2584,2,'+str(page)+'.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
        # Active query: the double-URL-encoded keyword decodes to "培训机构".
        url='https://search.51job.com/list/010000,000000,0000,00,9,09,%25E5%259F%25B9%25E8%25AE%25AD%25E6%259C%25BA%25E6%259E%2584,2,'+str(page)+'.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
        html=get_one_page(url)
        if html is None:
            # get_one_page() returns None on a non-200 status or a request
            # error; passing None to the regex parser would raise TypeError.
            print('Failed to fetch page',page)
            return
        for content in parse_one_page(html):
            print(content)
            save_to_excel(content)
    if __name__=='__main__':
        # Script entry point: create a workbook, scrape result pages 1 and 2
        # into its active sheet, then save everything to data.xlsx.
        wb=workbook.Workbook()
        # ``ws`` must stay a module-level name: save_to_excel() appends to it.
        ws=wb.active
        # Header row, in the same order as the tuples produced by the parser.
        fieldnames = ['name', 'web', 'company', 'location', 'salary']
        ws.append(fieldnames)
        for page_no in (1,2):
            main(page_no)
            # Pause one second after each page.
            time.sleep(1)
        wb.save('data.xlsx')
  • 相关阅读:
    Hibernate初学
    表分区
    单列函数
    Oracle基础
    8.28
    SpringMVC
    SpringMVC 初级操作
    试题评测
    Mybatis

  • 原文地址:https://www.cnblogs.com/lightmonster/p/11602988.html
Copyright © 2011-2022 走看看