zoukankan      html  css  js  c++  java
  • 爬虫之亚马逊爬取

    根据传入 mazon 函数的参数(搜索关键词和可选的分类)爬取亚马逊上的相关书籍,并将每件商品的信息以 JSON 文件保存。

    import requests
    import re
    import random
    import json
    from bs4 import BeautifulSoup
    import pickle
    # Browser User-Agent string (desktop Chrome / QQBrowser on Windows) used as
    # the 'User-Agent' header below.
    dic = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4295.400 QQBrowser/9.7.12661.400'
    # HTTP headers passed as `headers=` to every requests.get call in mazon().
    # The 'Cookie' value is one long hard-coded string assembled from adjacent
    # string literals.
    # NOTE(review): the cookie was presumably copied from a browser session and
    # has likely expired -- confirm before relying on it.
    useagent = {'User-Agent':dic,'Host':'www.amazon.cn','Cookie':'x-wl-uid=1+EeiKz9a/J/y3g6XfXTnSbHAItJEus3oQ6Gz+T/haur7dZfkNIgoxzMGwviB+42iWIyk9LR+iHQ=;'
                                                                 ' session-id=457-2693740-8878563; ubid-acbcn=459-5133849-3255047; lc-acbcn=zh_CN; i18n-prefs=CNY; '
                                                                 'session-token="8n/Oi/dUCiI9zc/0zDLjB9FQRC6sce2+Tl7F0oXncOcIYDK4SEJ7eek/Vs3UfwsRchW459OZni0AFjMW+'
                                                                 '9xMMBPSLM8MxLNDPP1/13unryj8aiRIZAE1WAn6GaeAgauNsijuBKKUwwLh8Dba7hYEjwlI1J6xlW0LKkkyVuApjRXnOsvdYr'
                                                                 'X8IURVpOxDBnuAF9r7O71d/NPkIQsHy7YCCw=="; session-id-time=2082787201l;'
                                                                 ' csm-hit=tb:s-85XYJNXFEJ5NBKR0JE6H|1566558845671&t:1566558845672&adb:adblk_no'}
    
    def _parse_goods_page(cont):
        """Extract title, author(s) and price from one product detail page.

        Args:
            cont: HTML text of the product page.

        Returns:
            A dict with the keys '主题' (title as a single string), '作者'
            (list of author names) and '价格' (list of non-whitespace price
            tokens). Fields that are not found come back empty.
        """
        title = re.findall(r'<span id="ebooksProductTitle" class="a-size-extra-large">(.*?)</span>', cont, re.S)
        # Drop all whitespace and HTML entities (e.g. "&nbsp;") from the title.
        title = re.sub(r'\s+|&.*?;', '', '-'.join(title))
        auther = re.findall(r'<span class="author notFaded".*?href=.*?>(.*?)</a>', cont, re.S)
        price = re.findall(r'<span class="a-size-base a-color-price a-color-price">(.*?)</span>', cont, re.S)
        # Keep only the non-whitespace tokens of the matched price text.
        price = re.findall(r'\S+', '-'.join(price))
        return {'主题': title, '作者': auther, '价格': price}

    def mazon(text,type=''):
        """Scrape amazon.cn search results for *text* and save each product as JSON.

        The first search page is fetched to discover the number of result
        pages. Every result page is then fetched, each product link on it is
        followed, and the extracted title/author/price is written to
        '第{page}页商品{n}.json' relative to the current working directory.

        Args:
            text: Search keyword, inserted as the `k` query parameter.
            type: Optional value for the `i` query parameter; omitted when
                empty. (Presumably an Amazon category alias -- confirm.)

        Returns:
            None. Progress problems are reported with print(); a failed first
            request or failed result page stops the crawl.
        """
        # Use a separate local so the `type` parameter is not reassigned.
        category = f'&i={type}' if type != '' else ''
        cookies = dict(useid = '123456',token = 'funkystyle')
        responsts = requests.get(f'https://www.amazon.cn/s?k={text}{category}&__mk_zh_CN=亚马逊网站&ref=nb_sb_noss',headers=useagent,cookies=cookies)
        if responsts.status_code != 200:
            print('首页访问失败!')
            return

        responsts.encoding = responsts.apparent_encoding
        index = responsts.text
        # The last disabled pagination item holds the total page count. When the
        # marker is absent, fall back to a single page instead of raising
        # IndexError.
        page = re.findall(r'class="a-disabled">(\d+)</li>', index)
        last_page = int(page[-1]) if page else 1

        for i in range(1, last_page + 1):
            # NOTE(review): qid looks like an epoch timestamp; a random value in
            # this range is presumably meant to mimic a browser request.
            rand = random.randint(1560000000, 1570000000)
            url = f'https://www.amazon.cn/s?k={text}{category}&page={i}&__mk_zh_CN=亚马逊网站&qid={rand}&ref=sr_pg_{i}'
            responst = requests.get(url, headers=useagent)
            if responst.status_code != 200:
                print(f'运行到第{i}页请求失败')
                break
            responst.encoding = responst.apparent_encoding
            content = responst.text
            goodslist = re.findall(r'<a class="a-link-normal a-text-normal" target="_blank" href="(.*?)ref=', content)
            for j, goods in enumerate(goodslist):
                goodsurl = f'https://www.amazon.cn/{goods}'
                res = requests.get(goodsurl, headers=useagent)
                if res.status_code != 200:
                    print(f'运行到{i}失败')
                    # Skip this product: parsing an error page would only
                    # produce an empty record.
                    continue
                res.encoding = res.apparent_encoding
                dic_infor = _parse_goods_page(res.text)
                # Overwrite rather than append so a rerun still leaves exactly
                # one valid JSON document per file.
                with open(f'第{i}页商品{j+1}.json', 'wt', encoding='utf8') as fa:
                    json.dump(dic_infor, fa)
    
    
    # Script entry: crawl the search results for the keyword 'python' with no
    # category filter (`type` keeps its default of '').
    mazon('python')
    # with open('第2页商品.json','rt',encoding='gbk') as fr:
    #     data = json.load(fr)
    # print(data)
    
  • 相关阅读:
    xml转换为json格式时,如何将指定节点转换成数组 Json.NET
    快速删除C#代码中的空白行
    C#编程中的Image/Bitmap与base64的转换及 Base-64 字符数组或字符串的长度无效问题 解决
    Flash设置(各种版本浏览器包括低版本IE)
    使用vcastr22.swf做flash版网页视频播放器
    使用VLC Activex插件做网页版视频播放器
    web项目 在visual studio 输出窗口显示调试信息
    geos 3.6.3库windows版本 已编译完成的32位版本和64位版本
    vs2017 打开附带的localdb v13
    visual studio code 里调试运行 Python代码
  • 原文地址:https://www.cnblogs.com/cheng825/p/11419324.html
Copyright © 2011-2022 走看看