zoukankan      html  css  js  c++  java
  • python实战项目之爬虫(一)

    因为马上就要大四实习了,博主实在懒得在学校官网上一个个翻,直接用爬虫将所有数据都爬下来

    放在表格里,这样感觉简单多了,可惜还没找到工作,so sad

    总共选择了三个学校:湖南大学,中南大学,湘潭大学

    三个项目代码分别如下(新手代码,惨不忍睹):

    湘潭大学:

    #!/usr/bin/python3
    #coding=utf-8
    """Scrape Xiangtan University's October-2018 career-talk listings into a spreadsheet.

    Fetches one JSON page per day of October from the campus jobs site and
    writes time / address / company / majors / detail-URL columns.
    """
    import requests
    import json
    import logging
    import os
    import xlwt
    basic_url = 'http://jobs.xtu.edu.cn/index/getdaycareers?day=2018-10-'
    logging.basicConfig(level=logging.DEBUG,format='')

    # Header row (column 4 is deliberately left blank, matching the data layout).
    workbook = xlwt.Workbook()
    sheet1 = workbook.add_sheet('list1')
    sheet1.write(0,0,'时间')
    sheet1.write(0,1,'地点')
    sheet1.write(0,2,'公司名称')
    sheet1.write(0,3,'专业要求')
    sheet1.write(0,5,'详细信息')
    count=1
    for day in range(1,32):  # October has 31 days
        url = basic_url+str(day)
        logging.debug('the clawer web site is:'+url)
        try:
            # timeout so one stalled day cannot hang the whole run
            data_list = requests.get(url, timeout=10).json()['data']  # the useful data
        except (requests.RequestException, ValueError, KeyError) as err:
            # skip a day that fails to fetch or parse instead of aborting everything
            logging.debug('fetching %s failed: %s', url, err)
            continue

        # NOTE: a distinct loop variable; the original shadowed the day index `i` here.
        for talk in data_list:
            sheet1.write(count,0,talk['meet_day'])
            sheet1.write(count,1,talk['address'])
            sheet1.write(count,2,talk['meet_name'])
            sheet1.write(count,3,talk['professionals'])
            sheet1.write(count,5,'http://jobs.xtu.edu.cn/detail/career?id='+talk['career_talk_id'])
            count=count+1
    # xlwt emits the legacy BIFF (.xls) format; saving it with an .xlsx
    # extension produces a file Excel refuses to open, so use .xls.
    workbook.save('湘潭大学十月份招聘信息.xls')

    中南大学:

    这个最坑,花了我一个多小时

    #!/usr/bin/python3
    #coding=utf-8
    """Scrape Central South University career-talk detail pages into a spreadsheet.

    Posts for the month's article list, then fetches each article page and
    extracts company, time, place and the requirements table.
    """
    import requests
    import xlwt
    import json
    import logging
    import bs4
    from bs4 import BeautifulSoup
    # Initialise the log file path and message format.
    logging.basicConfig(filename='log.txt',level=logging.DEBUG,format='%(asctime)s - %(levelname)s - %(message)s')
    logging.getLogger('requests').setLevel(logging.WARNING)  # silence requests' own logging

    # Initialise the spreadsheet header row.
    workbook = xlwt.Workbook()
    sheet1 = workbook.add_sheet('list')
    sheet1.write(0,0,'时间')
    sheet1.write(0,1,'地点')
    sheet1.write(0,2,'公司名称')
    sheet1.write(0,3,'职位名称')
    sheet1.write(0,4,'教育水平')
    sheet1.write(0,5,'专业要求')
    sheet1.write(0,6,'空缺数量')
    sheet1.write(0,7,'详细信息')

    # Fetch the month's listing of articles (one entry per career talk).
    json_all_url = 'http://jobsky.csu.edu.cn/Home/SearchDateAllMonth'
    dt1={'Date':'2018-09-04'}
    try:
        json_data = requests.post(json_all_url, data=dt1, timeout=10).json()
    except (requests.RequestException, ValueError) as err:
        logging.error('fetching the month listing failed: %s', err)
        json_data = []

    basic_html_url = 'http://jobsky.csu.edu.cn/Home/ArticleDetails/'

    counter_all = 1
    for data in json_data:
        # NewsID may arrive as an int, so coerce before concatenating.
        html_url = basic_html_url + str(data['NewsID'])

        try:
            html_txt = requests.get(html_url, timeout=10)
        except requests.RequestException as err:
            logging.error('the url %s could not be fetched: %s', html_url, err)
            continue
        bs = BeautifulSoup(html_txt.text,'lxml')

        # Company name from the page title.  Catch only AttributeError
        # (bs.find returning None) instead of a bare except.
        try:
            sheet1.write(counter_all, 2,
                         bs.find('h1',attrs={'class':'text-center title'}).getText())
        except AttributeError:
            logging.debug('the url %s has no title element', html_url)

        # Time and place of the talk.
        try:
            tp = bs.find('div',attrs={'id':'placeAndTime'})
            sheet1.write(counter_all, 0,
                         tp.find('p',attrs={'class':'text-center time'}).getText())
            sheet1.write(counter_all, 1,
                         tp.find('p',attrs={'class':'text-center place'}).getText())
        except AttributeError:
            logging.debug('the url %s has no time/place element', html_url)

        # Requirements table: the values we need sit in the odd-indexed <td>
        # cells (1=position, 3=education, 5=majors, 7=vacancies).
        try:
            tds = bs.find('table',attrs={'class':'table table-bordered'}).find_all('td')
            for td_index, column in ((1, 3), (3, 4), (5, 5), (7, 6)):
                if td_index < len(tds):
                    sheet1.write(counter_all, column, tds[td_index].getText())
        except AttributeError:
            logging.debug('the url %s has no requirements table', html_url)

        sheet1.write(counter_all, 7, html_url)
        # Always advance the row, even when some cells were missing; the
        # original only advanced on success, letting the next company
        # overwrite this row.
        counter_all += 1

    # Save once after the loop; xlwt emits the legacy BIFF (.xls) format,
    # so the extension must be .xls, not .xlsx.
    workbook.save('中南大学招聘信息.xls')
    View Code

    最后是湖南大学,不知道为什么,湖南大学招聘信息少的可怜

    #!/usr/bin/python3
    #coding=utf-8
    """Scrape Hunan University career-talk listings (single JSON page) into a spreadsheet."""
    import requests
    import json
    import logging
    import os
    import xlwt
    json_url = 'https://hnu.bysjy.com.cn/module/getcareers?start_page=1&keyword=&type=inner&day=&count=15&start=1&_=1536044186160'
    logging.basicConfig(level=logging.DEBUG,format='')

    workbook = xlwt.Workbook()
    sheet1 = workbook.add_sheet('list1')
    sheet1.write(0,0,'时间')
    sheet1.write(0,1,'地点')
    sheet1.write(0,2,'公司名称')
    sheet1.write(0,3,'招聘会')
    sheet1.write(0,4,'专业要求')
    # Header goes in column 5 to line up with the data rows below
    # (the original wrote it to column 6, leaving column 5 unlabeled).
    sheet1.write(0,5,'详细信息')
    count=1

    try:
        # timeout so a stalled request cannot hang the script
        data_list = requests.get(json_url, timeout=10).json()['data']  # the useful data
    except (requests.RequestException, ValueError, KeyError) as err:
        logging.error('fetching %s failed: %s', json_url, err)
        data_list = []

    for talk in data_list:
        sheet1.write(count,0,talk['meet_day']+talk['meet_time'])
        sheet1.write(count,1,talk['address'])
        sheet1.write(count,2,talk['company_name'])
        sheet1.write(count,3,talk['meet_name'])
        sheet1.write(count,4,talk['professionals'])
        sheet1.write(count,5,'https://hnu.bysjy.com.cn/detail/career?id='+talk['career_talk_id'])
        count=count+1
    # xlwt emits the legacy BIFF (.xls) format; save with a matching extension.
    workbook.save('湖南大学招聘信息.xls')
    View Code
  • 相关阅读:
    PHP 生成随机字符串与唯一字符串
    大数据和AI怎么与现代教育相结合?
    大数据和AI怎么与现代教育相结合?
    python正常时间和unix时间戳相互转换的方法
    python正常时间和unix时间戳相互转换的方法
    人人都谈大数据,你考虑过小数据的感受吗
    人人都谈大数据,你考虑过小数据的感受吗
    浅析机器学习的主题模型和语义分析
    浅析机器学习的主题模型和语义分析
    大数据对当代企业推广价值何在
  • 原文地址:https://www.cnblogs.com/gambler/p/9584689.html
Copyright © 2011-2022 走看看