zoukankan      html  css  js  c++  java
  • 数据分析数据获取(数据分析师岗位分析)

    import requests
    import re
    import time
    from lxml import etree
    import pymysql
    class my_spider:
        """Scrape data-analyst job postings from search.51job.com and save them to MySQL.

        Workflow: build listing-page URLs -> collect detail-page links ->
        fetch/parse each detail page -> insert one row per job.
        """

        def __init__(self, num1, num2):
            """Configure the listing-page range [num1, num2) and request headers.

            num1 -- first listing-page number (inclusive)
            num2 -- last listing-page number (exclusive)
            """
            # Listing-page URL template; "{}" is the 1-based page number.
            self.base_url = "https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,{}.html"
            # Browser-like headers so the site serves the normal HTML page.
            self.headers = {
                "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "Accept-Encoding":"gzip,deflate,br",
                "Accept-Language":"zh-CN,zh;q=0.9",
                "Cache-Control":"max-age=0",
                "Connection":"keep-alive",
                "Host":"search.51job.com",
                "Sec-Fetch-Mode":"navigate",
                "Sec-Fetch-Site":"none",
                "Sec-Fetch-User":"?1",
                "Upgrade-Insecure-Requests":"1",
                "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
            }
            self.page_num1 = num1
            self.page_num2 = num2
            # Detail-page links accumulated by run(), consumed by main().
            self.det_link = []

        def get_url(self):
            """Return the listing-page URLs for pages [page_num1, page_num2)."""
            return [self.base_url.format(i)
                    for i in range(self.page_num1, self.page_num2)]

        def get_pages(self, url):
            """Fetch one listing page and return its parsed link dicts.

            NOTE(review): the original defined a paid-proxy dict here with
            hard-coded credentials but never passed it to requests.get; the dead
            dict was removed — never embed credentials in source. Pass
            proxies=... explicitly if a proxy is actually needed.
            """
            response = requests.get(url=url, headers=self.headers)
            # 51job serves GBK-encoded pages, so decode explicitly.
            return self.parse_pages(response.content.decode('gbk'))

        def parse_pages(self, text):
            """Extract detail-page links from listing-page HTML.

            Returns a list of dicts with a single key 'job_info_link'.
            Rows without a link (e.g. malformed entries) are skipped instead of
            raising IndexError as the original did.
            """
            html_51job = etree.HTML(text)
            rows = html_51job.xpath("//div[@id='resultList']//div[@class='el']")
            info_list = []
            for row in rows:
                links = row.xpath("./p/span/a/@href")
                if links:
                    info_list.append({'job_info_link': links[0]})
            return info_list

        def run(self):
            """Walk every listing page and accumulate detail-page links in det_link."""
            for url in self.get_url():
                time.sleep(1)  # throttle: be polite to the server
                for entry in self.get_pages(url):
                    self.det_link.append(entry['job_info_link'])

        def get_page_info(self, url):
            """Fetch one job-detail page and return its parsed field dict.

            NOTE(review): another unused hard-coded-credential proxy dict was
            removed here (see get_pages).
            """
            print(url)
            response = requests.get(url=url, headers=self.headers)
            return self.parse_det_info(response.content.decode('gbk'))

        def parse_det_info(self, pages):
            """Parse job name, salary, company and requirements from detail HTML.

            Any field whose node is absent becomes the string 'NaN'.
            """
            item = etree.HTML(pages)

            def first_or_nan(xpath):
                # First xpath hit, or 'NaN' when the node is missing.
                found = item.xpath(xpath)
                return found[0] if found else 'NaN'

            return {
                'job_name': first_or_nan("//div[@class='cn']/h1/@title"),
                'job_money': first_or_nan("//div[@class='cn']/strong/text()"),
                'company_name': first_or_nan("//div[@class='cn']/p[@class='cname']/a/@title"),
                'job_request': first_or_nan("//div[@class='cn']/p[@class='msg ltype']/@title"),
            }

        def main(self):
            """Entry point: collect all links, then scrape and store each detail page."""
            self.run()
            print(self.det_link)
            for url in self.det_link:
                time.sleep(1)  # throttle between detail-page requests
                det_pageinfo = self.get_page_info(url)
                print(det_pageinfo)
                self.save_to_mysql(det_pageinfo)

        def save_to_mysql(self, page_Info):
            """Insert one scraped record into the det_job_info table.

            Uses a parameterized query: the original interpolated scraped text
            with str.format, which is vulnerable to SQL injection and breaks on
            any value containing a quote. Connection/cursor are now closed even
            when the insert fails.
            """
            conn = pymysql.connect(host='localhost', user='root', passwd='root123', db='baidu', port=3306)
            try:
                with conn.cursor() as cursor:
                    cursor.execute(
                        "insert into det_job_info(job_name,company_name,job_money,job_request) "
                        "VALUES(%s,%s,%s,%s)",
                        (page_Info['job_name'], page_Info['company_name'],
                         page_Info['job_money'], page_Info['job_request']),
                    )
                conn.commit()
            finally:
                conn.close()
    if __name__ == "__main__":
        # Scrape listing pages 159-160, two pages per spider instance.
        # Bug fix: the original loop was `range(159, 159, 2)`, which is empty,
        # so no scraping ever ran; it also hard-coded my_spider(159, 161)
        # instead of using the loop variable, contradicting the progress print.
        for i in range(159, 161, 2):
            time.sleep(1)
            spider = my_spider(i, i + 2)
            print('正在获取{}-{}页数据'.format(i, i + 2))
            spider.main()
  • 相关阅读:
    如何使用Win32API绘制树
    MSD_radix_sort
    Radix Sort
    Computer Science: the Big Picture
    天问之Linux内核中的不明白的地方
    viewController备注
    loadView、viewDidLoad及viewDidUnload的关系(转)
    iOS 中的视图函数 init initwithnib viewDidLoad viewWillAppear的总结
    做ios工程时,把UI从xib移动到代码中遇到的问题
    嘟嘟三期的所学所想
  • 原文地址:https://www.cnblogs.com/luweilehei/p/11446485.html
Copyright © 2011-2022 走看看