  • Python learning: scraping Wandoujia

    '''
    Index page:
        icon URL, download count, size, detail page URL

    Detail page:
        app name, rating, comment count, editor's review, download URL,
        description, user comments, 1-5 screenshot URLs

    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B

    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=FRsWKgWBqMBZLdxLaK4iem9B

    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=FRsWKgWBqMBZLdxLaK4iem9B

    Only the page parameter changes between requests; there are 32 pages in total.
    '''
    import re

    import requests
    from bs4 import BeautifulSoup

    # 1. Send a request
    def get_page(url):
        response = requests.get(url)
        return response
    
    # 2. Parse
    # Parse the detail page
    def parse_detail(text):
        soup = BeautifulSoup(text, 'lxml')

        # App name
        name = soup.find(name="span", attrs={"class": "title"}).text

        # Rating (share of positive reviews)
        love = soup.find(name='span', attrs={"class": "love"}).text

        # Comment count
        commit_num = soup.find(name='a', attrs={"class": "comment-open"}).text

        # Editor's review
        commit_content = soup.find(name='div', attrs={"class": "con"}).text

        # App download URL
        download_url = soup.find(name='a', attrs={"class": "normal-dl-btn"}).attrs['href']

        print(
            f'''
            ============= tank ==============
            App name: {name}
            Rating: {love}
            Comments: {commit_num}
            Editor's review: {commit_content}
            Download URL: {download_url}
            ============= end ==============
            '''
        )
    
    
    
    # Parse the index page
    def parse_index(data):
        soup = BeautifulSoup(data, 'lxml')

        # Get every app's <li> tag
        app_list = soup.find_all(name='li', attrs={"class": "card"})
        for app in app_list:
            # Icon URL: the data-original attribute of the first <img> tag
            img = app.find(name='img').attrs['data-original']
            print(img)

            # Download count: text of the span with class install-count
            down_num = app.find(name='span', attrs={"class": "install-count"}).text
            print(down_num)

            # Size: text of the span whose text matches "digits + MB" (\d+ matches digits);
            # search within this card (app), not the whole soup, or every app gets the first size
            size = app.find(name='span', text=re.compile(r"\d+MB")).text
            print(size)

            # Detail page URL: href of the first <a> tag in the card
            detail_url = app.find(name='a').attrs['href']
            print(detail_url)

            # 3. Request the app's detail page
            response = get_page(detail_url)

            # 4. Parse the detail page
            parse_detail(response.text)
    
    
    def main():
        # Walk all 32 pages of the category
        for line in range(1, 33):
            url = f"https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={line}&ctoken=FRsWKgWBqMBZLdxLaK4iem9B"

            # 1. Request the paged API
            response = get_page(url)
            print('*' * 1000)
            # Deserialize the JSON body into a dict
            data = response.json()

            # The 'content' field holds the apps' <li> HTML
            app_li = data['data']['content']
            # 2. Parse the app tags
            parse_index(app_li)
    
    
    if __name__ == '__main__':
        main()
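
    A bare requests.get works only while Wandoujia serves scripted clients; if pages
    come back empty or with errors, sending a browser-like User-Agent and a timeout
    helps. A minimal hardening sketch of get_page (the header value is only an
    example, not something the site is known to require):

    import requests

    def get_page(url):
        # Assumption: a desktop User-Agent and a timeout make the request
        # more robust; adjust or extend the headers as needed
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # fail fast on 4xx/5xx instead of parsing an error page
        return response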

     2. Importing the data into MongoDB

    # The front end and back end are decoupled: fetch the data from the
    # front-end API, then insert it into the database.
    '''
    AJAX requests:
    Request URL: https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=-HP5XfiRGS5Q6MuHAoOJX69D
    Request URL: https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=-HP5XfiRGS5Q6MuHAoOJX69D
    Crawl up to page=32.
    First grab each tag, then pull the wanted data out of it.
    '''
    import re

    import requests
    from bs4 import BeautifulSoup
    from pymongo import MongoClient

    client = MongoClient('localhost', 27017)
    index_col = client['wandoujia']['index']
    detail_col = client['wandoujia']['detail']

    def get_page(url):
        response = requests.get(url)
        return response
    # Parse the detail page
    def parse_detail(text):
        soup = BeautifulSoup(text, 'lxml')

        # Every field may be missing on some pages, so fall back to None
        # App name
        try:
            name = soup.find(name="span", attrs={"class": "title"}).text
        except Exception:
            name = None
        # Rating
        try:
            love = soup.find(name="span", attrs={"class": "love"}).text
        except Exception:
            love = None
        # Comment count
        try:
            commit_num = soup.find(name='a', attrs={"class": "comment-open"}).text
        except Exception:
            commit_num = None
        # Editor's review
        try:
            commit_content = soup.find(name="div", attrs={"class": "con"}).text
        except Exception:
            commit_content = None
        # Download URL
        try:
            download_url = soup.find(name='a', attrs={"class": "normal-dl-btn"}).attrs['href']
        except Exception:
            download_url = None

        # Build the document once, with placeholders for missing fields,
        # so detail_data is always defined before the insert
        detail_data = {
            'name': name,
            'love': love if love else 'no ratings yet',
            'commit_num': commit_num,
            'commit_content': commit_content,
            'download_url': download_url if download_url else 'no download package'
        }
        detail_col.insert_one(detail_data)
        print(f'Detail data for app {name} inserted!')
    # Parse the index page
    def parse_index(data):
        soup = BeautifulSoup(data, 'lxml')
        app_list = soup.find_all(name='li', attrs={"class": "card"})
        for app in app_list:
            # Icon URL: the data-original attribute of the first <img> tag
            img = app.find(name='img').attrs['data-original']

            # Download count: text of the span with class install-count
            down_num = app.find(name='span', attrs={"class": "install-count"}).text

            # Size: text of the span in this card matching "digits + MB" (\d+ matches digits)
            size = app.find(name='span', text=re.compile(r"\d+MB")).text

            # Detail page URL: href of the first <a> tag in the card
            detail_url = app.find(name='a').attrs['href']

            # Insert the index data
            index_data = {
                'img': img,
                'down_num': down_num,
                'size': size,
                'detail_url': detail_url
            }
            index_col.insert_one(index_data)
            print('Index data inserted!')

            # 3. Request the app's detail page
            response = get_page(detail_url)
            # 4. Parse the detail page
            parse_detail(response.text)
    def main():
        # Only page 1 for a test run; use range(1, 33) to crawl all 32 pages
        for line in range(1, 2):
            url = f"https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={line}&ctoken=-HP5XfiRGS5Q6MuHAoOJX69D"
            # Request the paged API
            response = get_page(url)
            print('*' * 1000)
            # JSON always uses double quotes; response.json() deserializes it into a dict
            data = response.json()
            # The 'content' field holds the apps' <li> HTML
            app_li = data['data']['content']
            parse_index(app_li)
        # Close the MongoDB client after all pages are done
        client.close()

    if __name__ == '__main__':
        main()
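
    After a run, the inserted documents can be checked straight from MongoDB.
    A quick sketch for inspecting the two collections used above, with pymongo's
    count_documents and find_one (the database layout matches the code):

    from pymongo import MongoClient

    client = MongoClient('localhost', 27017)
    index_col = client['wandoujia']['index']
    detail_col = client['wandoujia']['detail']

    # How many index cards and detail pages were stored
    print(index_col.count_documents({}))
    print(detail_col.count_documents({}))

    # Show one stored detail document
    print(detail_col.find_one())

    client.close()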