zoukankan      html  css  js  c++  java
  • 个人冲刺(七)

    苏宁爬虫基本完成
    import requests
    import re
    import time
    import mysql
    import json


    # Patterns are compiled once at import time instead of once per product.
    _SKU_RE = re.compile(r'<span class="def-price" datasku="(.*?),(.*?)" brand_id=".*?" mdmGroupId=".*?">')
    _DETAIL_RE = re.compile(r'<img alt="(.*?)" src="(.*?)" picPriority=".*?">')
    _SCORE_RE = re.compile(
        r'{"parentIndexName":"评价","parentIndexScore":"(.*?)","serParentIndexCateCon":".*?","parentIndexCode":".*?"},{"parentIndexName":"物流","parentIndexScore":"(.*?)","serParentIndexCateCon":".*?","parentIndexCode":".*?"},{"parentIndexName":"售后","parentIndexScore":"(.*?)"')
    _ITEM_NO_RE = re.compile(r'<span>货号</span> </div> </td> <td class="val">(.*?)</td>')
    _SHOP_RE = re.compile(r'<a id="chead_indexUrl" href="(.*?)" title="(.*?)">')
    _GB_PRICE_RE = re.compile(r'"gbPrice":"(.*?)"')
    _NET_PRICE_RE = re.compile(r'"netPrice":"(.*?)"')


    def _fetch_text(url, request_headers, timeout):
        """Return the body of a GET request to *url* as text."""
        return requests.get(url, headers=request_headers, timeout=timeout).text


    def _strip_jsonp(text, callback):
        """Return the payload inside ``callback(...)``, or *text* if it is not wrapped."""
        text = text.strip().rstrip(";")
        prefix = callback + "("
        if text.startswith(prefix) and text.endswith(")"):
            return text[len(prefix):-1]
        return text


    def _parse_labels(raw):
        """Return the review-label list from a JSONP reply, or None if it cannot be parsed."""
        if not raw:
            return None
        try:
            payload = json.loads(_strip_jsonp(raw, "commodityrLabels"))
        except ValueError:
            # Covers json.JSONDecodeError: the reply was not JSON (e.g. an error page).
            return None
        if not isinstance(payload, dict):
            return None
        return payload.get("commodityLabelCountList") or []


    def getitem(keyword, n, request_headers=None, timeout=10):
        """Crawl one Suning search-result page and store the products found.

        For every product on the page this prints its description, image,
        product URL, shop scores, item number, shop name/link, review labels
        and price, then appends a row
        ``[date, item number, price, name, link, image, origin]``.
        The collected rows are passed to ``mysql.insert_item``.

        Args:
            keyword: Search keyword, concatenated into the search URL.
            n: Value of the ``paging`` query parameter.
            request_headers: HTTP headers for every request. When omitted, the
                module-level ``headers`` global is used, as the original
                code did.
            timeout: Seconds passed to ``requests.get`` so that a stalled
                connection raises instead of hanging the crawl.
        """
        if request_headers is None:
            # Set by the script entry point; NameError here means it was never defined.
            request_headers = headers

        itemurl = "https://search.suning.com/emall/searchV1Product.do?keyword=" + keyword + "&pg=01&cp=" + "0" + "&paging=" + str(n)
        print("itemurl:"+itemurl)
        response = _fetch_text(itemurl, request_headers, timeout).replace("|||||", ",")

        # (prdid, shopid) pairs, and (description, image URL) pairs.
        id_list = _SKU_RE.findall(response)
        detail_list = _DETAIL_RE.findall(response)

        print("长度:"+str(len(id_list)))
        if len(id_list) != len(detail_list):
            # The two lists are paired by position; indexing past the shorter
            # one used to raise IndexError, so only the pairable part is used.
            print("警告:商品数(" + str(len(id_list)) + ")与描述数(" + str(len(detail_list)) + ")不一致,只处理能配对的部分")

        item = []
        for index, (sku, detail) in enumerate(zip(id_list, detail_list), start=1):
            prdid, shopid = sku
            name, image = detail
            print("——————————————————————第"+str(index)+"件商品——————————————————————")
            print("描述:"+name+" 图片:"+image)

            link = "https://product.suning.com/" + shopid + "/" + prdid + ".html"
            print("商品网址:" + link)

            # int() drops the leading zeros of the shop id for the score endpoint.
            score_url = "https://product.suning.com/pds-web/ajax/getApiRemoteMap_" + str(int(shopid)) + "_shopScoreCallback.html?"
            # Remove backslashes so the score regex can match the escaped JSON.
            score_text = _fetch_text(score_url, request_headers, timeout).replace("\\", "")
            scores = _SCORE_RE.findall(score_text)
            if scores:
                print("店铺评分:" + str(scores[0][0]))
                print("物流评分:" + str(scores[0][1]))
                print("售后评分:" + str(scores[0][2]))
            else:
                print("评分:暂无店铺各项评分")

            product_page = _fetch_text(link, request_headers, timeout)
            item_numbers = _ITEM_NO_RE.findall(product_page)
            if item_numbers:
                item_no = str(item_numbers[0])
                print("货号:" + item_no)
            else:
                item_no = "暂无"
                print("货号:暂无此商品货号")

            shops = _SHOP_RE.findall(product_page)
            if shops:
                shop_link, shop_name = shops[0]
            else:
                shop_link = shop_name = "暂无"
            print("店铺:" + shop_name + " 店铺链接:" + shop_link)

            labels_url = "https://review.suning.com/ajax/getClusterReview_labels/style--0000000"+prdid+"-"+shopid+"-----commodityrLabels.htm?"
            labels = _parse_labels(_fetch_text(labels_url, request_headers, timeout))
            if labels is None:
                print("无")
            else:
                print("评论关键字个数:"+str(len(labels)))
                for label in labels:
                    print("label:"+label["labelName"]+" num:"+str(label["labelCnt"]))

            priceurl = "https://pas.suning.com/nspcsale_0_0000000" + prdid + "_0000000" + prdid + "_" + shopid + "_60_311_3110199_20089_1000095_9095_10638_Z001___R1901001_0.36_0___000060021____0___448.224_2_01_20002_20006__.html?"
            price_text = _fetch_text(priceurl, request_headers, timeout)
            # Prefer "gbPrice"; fall back to "netPrice" only when it is absent.
            prices = _GB_PRICE_RE.findall(price_text) or _NET_PRICE_RE.findall(price_text)
            if prices:
                price = str(prices[0])
                print("价格:" + price)
            else:
                price = "无"
                print("无价格:")

            orgin = "苏宁"
            item.append([time.strftime("%Y-%m-%d"), item_no, price, name, link, image, orgin])
            print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

        # NOTE(review): the source's indentation was lost; the insert is placed
        # after the loop so each row is written once per page — confirm.
        mysql.insert_item(item)
        # NOTE: an insert-or-update variant based on mysql.get_conn() and
        # mysql.update_item(item) was sketched here but is disabled.


    # def getprice(prdid, shopid):
    # durl = "https://product.suning.com/"+shopid+"/"+prdid+".html"
    # print("商品网址:"+durl)
    #
    # shopid2 = int(shopid)
    # pjurl = "https://product.suning.com/pds-web/ajax/getApiRemoteMap_"+str(shopid2)+"_shopScoreCallback.html?"
    # # print(pjurl)
    # response2 = requests.get(pjurl).text.replace("\", "")
    # getpj = re.compile(r'{"parentIndexName":"评价","parentIndexScore":"(.*?)","serParentIndexCateCon":".*?","parentIndexCode":".*?"},{"parentIndexName":"物流","parentIndexScore":"(.*?)","serParentIndexCateCon":".*?","parentIndexCode":".*?"},{"parentIndexName":"售后","parentIndexScore":"(.*?)"')
    # pjlist = re.findall(getpj, response2)
    # if pjlist:
    # print("店铺评分:"+str(pjlist[0][0]))
    # print("物流评分:"+str(pjlist[0][1]))
    # print("售后评分:"+str(pjlist[0][2]))
    # else:
    # print("评分:暂无店铺各项评分")
    #
    # response3 = requests.get(durl).text
    # getNum = re.compile(r'<span>货号</span> </div> </td> <td class="val">(.*?)</td>')
    # numlist = re.findall(getNum, response3)
    # if numlist:
    # print("货号:"+str(numlist[0]))
    # else:
    # print("货号:暂无此商品货号")
    #
    # priceurl = " https://pas.suning.com/nspcsale_0_0000000"+prdid+"_0000000"+prdid+"_"+shopid+"_60_311_3110199_20089_1000095_9095_10638_Z001___R1901001_0.36_0___000060021____0___448.224_2_01_20002_20006__.html?"
    # res = requests.get(priceurl).text
    # # print(res)
    # getK = re.compile(r'"gbPrice":"(.*?)"')
    # keyL = re.findall(getK, res)
    # if keyL:
    # print("价格:"+str(keyL[0]))
    # else:
    # getK = re.compile(r'"netPrice":"(.*?)"')
    # keyL = re.findall(getK, res)
    # print("价格:"+str(keyL[0]))


    if __name__ == '__main__':
        # Script entry point: crawl 200 result pages for one fixed keyword.

        # Module-level global: getitem() reads `headers` for every HTTP request,
        # so it must be assigned before the first call.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"
        }
        # Set of category keywords. NOTE(review): the loop below does not use it;
        # presumably kept for a later crawl over every category — confirm.
        key = {"手机", "运营商", "智能数码", "家用电器", "帮客", "房产", "厨卫", "生活家电", "厨具", "电脑办公", "相机", "电竞", "家具", "家装", "家纺", "灯具",
               "食品", "酒水", "生鲜", "特产", "美妆", "个护", "清洁", "宠物", "母婴", "玩具", "车床", "童装", "运动", "户外", "国米", "骑行", "女装", "男装",
               "内衣", "鞋靴", "箱包", "钟表", "珠宝", "艺术", "汽车", "电摩", "汽车用品", "图书", "艺术", "原版", "文学", "医药健康", "计生情趣", "理财", "分期",
               "便民"}
        # The loop index is passed to getitem() as the `paging` value.
        for i in range(200):
            getitem("外套", i)
  • 相关阅读:
    mysql 全量备份和增量备份
    mysql 修改密码提示ERROR 1819 (HY000): Your password does not satisfy the current policy requirements
    MHA 常见问题解决
    MHA 数据库高可用+ GTID 同步测试部署
    通过电脑抓手机端log
    用js递归遍历树结构
    js实现全屏
    使表格随着内容自适应宽度
    POST请求
    vue中组件通信
  • 原文地址:https://www.cnblogs.com/mumulailai/p/14911983.html
Copyright © 2011-2022 走看看