  • Web scraping examples

    Scraping job postings from Lagou

    Lagou: https://www.lagou.com/

    Suppose we search for Python positions. The page seen in the browser (previous_url) is:

    https://www.lagou.com/jobs/list_python/p-city_3?&cl=false&fromSearch=true&labelWords=&suginput=

    The listings themselves, however, are loaded by an Ajax request (craw_url, visible in the browser's Network panel):

    https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false

    import requests
    import json
    import time

    previous_url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
    craw_url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"

    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'referer': 'https://www.lagou.com/jobs/list_python/p-city_3?&cl=false&fromSearch=true&labelWords=&suginput=',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    }

    # Create a session so cookies persist across requests
    s = requests.Session()
    # Visit the search page first to pick up its cookies
    s.get(previous_url, headers=header, timeout=3)
    # The cookies collected above now live on the session
    cookie = s.cookies
    # Fetch the listings page by page
    for i in range(1, 16):
        payload = {
            'first': 'true',
            'pn': str(i),       # page number
            'kd': 'python',     # search keyword
        }
        res = s.post(craw_url, data=payload, headers=header, timeout=5).text
        recruit = json.loads(res)
        print(recruit)
        position_info = recruit.get('content').get('positionResult').get('result')
        with open('position.txt', mode='ab+') as fw:
            fw.write(json.dumps(position_info, ensure_ascii=False).encode('utf-8'))
            fw.write('\n'.encode('utf-8'))
        time.sleep(20)   # throttle so the anti-crawler is less likely to trip
    

    When scraping the position data, the requests must carry the cookies obtained from the search page.
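
    As a minimal sketch of the same idea without relying on the session to forward cookies implicitly, the cookie jar from the preliminary GET can be passed to a plain requests.post call (URLs, headers, and payload are the ones from the script above):

    import requests

    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'referer': 'https://www.lagou.com/jobs/list_python/p-city_3?&cl=false&fromSearch=true&labelWords=&suginput=',
    }
    previous_url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
    craw_url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"

    # GET the search page purely to collect its cookies
    cookie_jar = requests.get(previous_url, headers=header, timeout=3).cookies

    # Pass the cookie jar explicitly; without it the endpoint tends to
    # answer with an anti-crawler message instead of job data
    res = requests.post(
        craw_url,
        data={'first': 'true', 'pn': '1', 'kd': 'python'},
        headers=header,
        cookies=cookie_jar,
        timeout=5,
    )
    print(res.text)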

    Scraping the novel Dream of the Red Chamber (Hongloumeng)

    The novel's table of contents lives at https://www.shicimingju.com/book/hongloumeng.html. Each entry in the .book-mulu list links to one chapter; the script collects every chapter link, then fetches each chapter and appends its title and body text to hlm.txt.

    import requests
    from bs4 import BeautifulSoup

    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    }

    # Fetch the table-of-contents page
    response = requests.get('https://www.shicimingju.com/book/hongloumeng.html', headers=header)

    soup = BeautifulSoup(response.text, 'lxml')
    # Build an absolute URL for every chapter link in the contents list
    link_list = [
        'https://www.shicimingju.com' + li.find('a').get('href')
        for li in soup.select('.book-mulu li')
    ]
    with open('hlm.txt', mode='ab+') as fw:
        for link in link_list:
            res = requests.get(link, headers=header)
            soup2 = BeautifulSoup(res.text, 'lxml')
            # Write the chapter title, then the chapter text
            fw.write((soup2.select('.bookmark-list h1')[0].text).encode('utf-8'))
            fw.write('\n'.encode('utf-8'))
            fw.write((soup2.select('.bookmark-list p')[0].text).encode('utf-8'))
            fw.write('\n'.encode('utf-8'))
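
    The loop above opens the file in binary append mode and encodes every string by hand. A sketch of the same writes in text mode, where Python handles the encoding (soup2 as in the loop above):

    # Text mode with an explicit encoding avoids the manual .encode() calls
    with open('hlm.txt', mode='a', encoding='utf-8') as fw:
        fw.write(soup2.select('.bookmark-list h1')[0].text + '\n')
        fw.write(soup2.select('.bookmark-list p')[0].text + '\n')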
    

    Scraping KFC store information

    KFC's store locator (http://www.kfc.com.cn/kfccda/storelist/index.aspx) loads its data from an Ajax endpoint. First download the page itself, then query the endpoint directly.

    import requests

    # Download the store-locator page itself
    res = requests.get("http://www.kfc.com.cn/kfccda/storelist/index.aspx")

    with open('text2.html', mode='wb') as fw:
        # Stream the response body to disk in 1 KB chunks
        for chunk in res.iter_content(chunk_size=1024):
            fw.write(chunk)
            
    
    import requests
    import json

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'Referer': 'http://www.kfc.com.cn/kfccda/storelist/index.aspx',
    }

    # 'op=cname' queries the store list by city name
    res = requests.post(
        "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx",
        params={
            'op': 'cname'
        },
        data={
            'cname': '上海',    # city name
            'pid': '',
            'keyword': '',
            'pageIndex': 1,
            'pageSize': 500    # large page size so one request covers every store
        },
        headers=header
    )

    # The JSON payload keeps the stores under the 'Table1' key
    kfc_info = json.loads(res.text).get('Table1')
    kfc_list = [
        {
            "storeName": kfc.get('storeName') + '餐厅',
            "addressDetail": kfc.get("addressDetail"),
            "pro": kfc.get("pro")
        }
        for kfc in kfc_info
    ]

    print(kfc_list)
    print(len(kfc_list))  # 455
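
    To keep the result rather than just print it, a minimal sketch that dumps kfc_list to a JSON file (the filename kfc_stores.json is an arbitrary choice):

    # ensure_ascii=False keeps the Chinese store names readable in the file
    with open('kfc_stores.json', mode='w', encoding='utf-8') as fw:
        json.dump(kfc_list, fw, ensure_ascii=False, indent=2)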
    

    Scraping jokes from Qiushibaike

    Qiushibaike: https://www.qiushibaike.com/. The script walks the recommended-feed pages and pulls each item's subject, link, vote count, comment count, and author.

    import requests
    from bs4 import BeautifulSoup

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'Cookie': '_xsrf=2|c757820a|8689eab698fb588fb9f2057ccf7d7ff7|1596541908; _qqq_uuid_="2|1:0|10:1596541909|10:_qqq_uuid_|56:N2E0ODM0MzQ0MzhhMmQ0ODhiN2VkOWEzZjZlNjgwZWIwYjFhYmUyOQ==|628d31f1d77ddca4ff48407bae2999366c0a036422afa9a71656a0f181373394"; gr_user_id=48d9d1c7-67fb-403b-8bec-b830ce07b762; ff2672c245bd193c6261e9ab2cd35865_gr_session_id=706bbad7-66f7-4880-8b06-7c39369518e2; Hm_lvt_2670efbdd59c7e3ed3749b458cafaa37=1596541910; _ga=GA1.2.2084709124.1596541910; _gid=GA1.2.298303643.1596541910; ff2672c245bd193c6261e9ab2cd35865_gr_session_id_706bbad7-66f7-4880-8b06-7c39369518e2=true; grwng_uid=62b3537d-3023-4060-a4fe-9a45f7e07d67; Hm_lpvt_2670efbdd59c7e3ed3749b458cafaa37=1596542096',
    }

    details_list = []
    # Walk the 13 pages of the recommended feed
    for i in range(1, 14):
        url = f'https://www.qiushibaike.com/8hr/page/{i}/'
        res = requests.get(url, headers=header)
        soup = BeautifulSoup(res.text, 'lxml')
        div_list = soup.select('.recmd-right')
        for div in div_list:
            # Some items carry no comment count; default to 0
            try:
                comment = div.find_all('span')[3].text
            except Exception:
                comment = 0
            details = {
                'subject': div.find('a').text,
                'link': 'http://www.qiushibaike.com' + div.find('a').get('href'),
                'support': div.find_all('span')[0].text,   # upvote count
                'comment': comment,
                'author': div.select('.recmd-name')[0].text
            }
            details_list.append(details)

    print(details_list)
    print(len(details_list))  # 189
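
    A minimal sketch for exporting details_list to CSV (the filename jokes.csv is arbitrary; the field names are the dict keys built above). utf-8-sig adds a BOM so spreadsheet software detects the encoding:

    import csv

    # DictWriter maps each details dict onto one CSV row
    with open('jokes.csv', mode='w', newline='', encoding='utf-8-sig') as fw:
        writer = csv.DictWriter(fw, fieldnames=['subject', 'link', 'support', 'comment', 'author'])
        writer.writeheader()
        writer.writerows(details_list)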
    