zoukankan      html  css  js  c++  java
  • 爬虫实操

    爬取四大名著之《三国演义》

    三国演义地址

    productor

    import requests
    
    res = requests.get("https://www.shicimingju.com/book/sanguoyanyi.html")
    
    with open('text.html',mode='wb') as fw:
        for line in res.iter_content():
            fw.write(line)
    

    customer

    from bs4 import BeautifulSoup
    import requests
    
    soup = BeautifulSoup(open('text.html'), 'lxml')
    
    download_info = (
        {
            'title': li.text,
            'link': 'https://www.shicimingju.com' + li.find('a').attrs.get('href')
        }
        for li in soup.find(class_='book-mulu').find_all(name='li')
    )
    
    for item in download_info:
        article_soup = BeautifulSoup(requests.get(item.get('link', None)).text, 'lxml')
        article_div = article_soup.find(class_='bookmark-list')
        with open('sgyy.txt', mode='ab+') as fw:
            title = article_div.find(name='h1').text
            content = article_div.find(class_='chapter_content').text
            fw.write((title + '
    ' + content + '
    ').encode('utf-8'))
    

    效果图

    爬取上海市的肯德基门店信息

    上海市肯德基门店

    productor

    import requests
    
    res = requests.get("http://www.kfc.com.cn/kfccda/storelist/index.aspx")
    
    with open('text2.html',mode='wb') as fw:
        for line in res.iter_content():
            fw.write(line)
    

    customer

    import requests
    import json
    
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'Referer': 'http://www.kfc.com.cn/kfccda/storelist/index.aspx',
    }
    
    res = requests.post(
        "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx",
        params={
            'op': 'cname'
        },
        data={
            'cname': '上海',
            'pid': '',
            'keyword':'',
            'pageIndex': 1,
            'pageSize': 500
        },
        headers=header
    )
    
    kfc_info = json.loads(res.text).get('Table1')
    kfc_list = [
        {
            "storeName":kfc.get('storeName')+'餐厅',
            "addressDetail":kfc.get("addressDetail"),
            "pro":kfc.get("pro")
        }
        for kfc in kfc_info
    ]
    
    print(kfc_list)
    print(len(kfc_list)) #455
    

  • 相关阅读:
    软件工程课程总结
    《构建之法》部分读书笔记
    软件工程课程作业
    本周软件工程课程感想
    软件工程课设迭代开发第八天
    软件工程课设迭代开发第五至七天
    软件工程课设迭代开发第四天
    软件工程课设迭代开发第三天
    软件工程课设迭代开发第二天
    Bresenham中点画圆法与二阶差分算法
  • 原文地址:https://www.cnblogs.com/surpass123/p/13429723.html
Copyright © 2011-2022 走看看