  • A simple crawler that scrapes 51job job postings

    #-*- coding:utf-8 -*-
    from urllib import request
    from urllib import parse
    from bs4 import BeautifulSoup
    import pymysql  # MySQL driver, used through SQLAlchemy's mysql+pymysql dialect
    from sqlalchemy import create_engine, MetaData, Table
    def getYao(url):
        # Fetch a job-detail page and return the text of its requirements block.
        urlFirst = request.Request(url)
        urlFirst.add_header("User-Agent",
                       "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
        urlFirst.add_header("Origin", "http://search.51job.com")
        postData = parse.urlencode([
            ("s", "01"),
            ("t", "0"),
        ])
        # 51job serves GBK-encoded pages: encode the payload and decode the response as gbk
        return_ = request.urlopen(urlFirst, data=postData.encode("gbk"))
        contentNei = return_.read().decode("gbk")
        neisp = BeautifulSoup(contentNei, "html.parser")
        return neisp.find("div", class_="job_msg").get_text()
    # Use the pymysql driver explicitly; a bare "mysql://" URL would look for MySQLdb.
    engine = create_engine("mysql+pymysql://root:root@localhost:3306/laravel?charset=utf8", echo=True)
    # Bound metadata + autoload reflects the existing "jobs" table (legacy SQLAlchemy 1.x style).
    metadata = MetaData(engine)
    users_table = Table("jobs", metadata, autoload=True)
    for page in range(1, 11):  # walk search-result pages 1-10
        url = "http://search.51job.com/list/000000,000000,0000,00,9,99,C,2," + str(page) + ".html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=102&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
        rep = request.Request(url)
        rep.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
        rep.add_header("Origin","http://search.51job.com")
        postData = parse.urlencode([
            ("fromJs", "1"),
            ("jobarea", "040000"),
            ("keyword", "php"),
            ("keywordtype", "2"),
            ("lang", "c"),
            ("stype", "2"),
            ("postchannel", "0000"),
            ("fromType", "1"),
            ("confirmdate", "9")
        ])
        print(postData)
        return_ = request.urlopen(rep,data=postData.encode("gbk"))
        content = return_.read().decode("gbk")
        sp = BeautifulSoup(content,"html.parser")
        print(content)  # debug: dump the raw search-results page
        sql_moban = users_table.insert()
        rows = sp.find("div", class_="dw_table").find_all("div", class_="el")
        for row in rows[1:]:  # the first div.el is the table header, so skip it
            # Fetch the detail page once and reuse the result (the original called getYao twice per row).
            yaoqiu = getYao(row.find('a').get('href'))
            result = sql_moban.execute(zhiwei=row.find("a").get_text().strip(),
                                       company=row.find("span", class_="t2").string,
                                       address=row.find("span", class_="t3").string,
                                       slary=row.find("span", class_="t4").string,  # column name "slary" as in the table
                                       riqi=row.find("span", class_="t5").string,
                                       yaoqiu=yaoqiu)
        print("下载完成")  # "download complete"
    
    #print(sql_moban)
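
    For reference, the insert above assumes a jobs table with columns zhiwei, company, address, slary, riqi, and yaoqiu. The script reflects the table with autoload=True, so the real schema never appears; the sketch below is a hypothetical explicit definition, with the column types (and the id key) being assumptions.

    # Hypothetical explicit definition of the reflected "jobs" table.
    # Column names come from the insert above; types are assumptions.
    from sqlalchemy import Column, Integer, String, Text

    jobs_table = Table(
        "jobs", MetaData(),
        Column("id", Integer, primary_key=True),  # assumed surrogate key
        Column("zhiwei", String(255)),            # job title
        Column("company", String(255)),           # company name
        Column("address", String(255)),           # work location
        Column("slary", String(64)),              # salary; spelling kept from the original schema
        Column("riqi", String(32)),               # posting date
        Column("yaoqiu", Text),                   # requirements text returned by getYao()
    )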

    Writing the scraped data into the database; PHP salaries really aren't high.
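
    A side note: sql_moban.execute(...) relies on SQLAlchemy's legacy bound-metadata execution, which only works on 1.x. A minimal sketch of the same insert in the explicit-connection style (the values mirror the row loop above):

    # Legacy-free equivalent of sql_moban.execute(...); engine.begin()
    # opens a connection and commits when the block exits cleanly.
    with engine.begin() as conn:
        conn.execute(users_table.insert().values(
            zhiwei=row.find("a").get_text().strip(),
            company=row.find("span", class_="t2").string,
            address=row.find("span", class_="t3").string,
            slary=row.find("span", class_="t4").string,
            riqi=row.find("span", class_="t5").string,
            yaoqiu=yaoqiu,
        ))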

  • Original post: https://www.cnblogs.com/summerkxy/p/7083682.html