# 苏宁爬虫基本完成 (Suning crawler — basically complete)
import requests
import re
import time
import mysql
import json
def getitem(keyword, n):
    """Scrape one page of Suning search results for *keyword* and store it.

    Fetches result page *n* of the keyword search, then for every product
    found: prints its description/image, fetches the shop score, the detail
    page (item number, shop name/link), the review-label summary and the
    price, and finally inserts all collected rows via ``mysql.insert_item``.

    Relies on the module-level ``headers`` dict (defined under the
    ``__main__`` guard) for every HTTP request.

    :param keyword: search keyword, concatenated into the URL as-is.
    :param n: page index sent as the ``paging`` query parameter.
    """
    itemurl = ("https://search.suning.com/emall/searchV1Product.do?keyword="
               + keyword + "&pg=01&cp=" + "0" + "&paging=" + str(n))
    print("itemurl:" + itemurl)
    # The listing uses "|||||" as a field separator; normalise to commas so
    # the regexes below can match across field boundaries.
    response = requests.get(itemurl, headers=headers).text.replace("|||||", ",")
    # Product id (prdid) and shop id pairs.
    getID = re.compile(r'<span class="def-price" datasku="(.*?),(.*?)" brand_id=".*?" mdmGroupId=".*?">')
    IDlist = re.findall(getID, response)
    # Product description text and image URL.
    getDetail = re.compile(r'<img alt="(.*?)" src="(.*?)" picPriority=".*?">')
    dList = re.findall(getDetail, response)
    print("长度:" + str(len(IDlist)))
    item = []
    # zip() pairs each id with its detail and stops at the shorter list,
    # avoiding the IndexError a manual dList[i] lookup could hit when the
    # two regexes match different numbers of products.
    for i, (key, detail) in enumerate(zip(IDlist, dList)):
        print("——————————————————————第" + str(i + 1) + "件商品——————————————————————")
        name = detail[0]
        image = detail[1]
        print("描述:" + name + "\n图片:" + image)
        durl = "https://product.suning.com/" + key[1] + "/" + key[0] + ".html"
        print("商品网址:" + durl)
        link = durl
        # int() strips the zero padding from the shop id for the score URL.
        shopid2 = int(key[1])
        pjurl = ("https://product.suning.com/pds-web/ajax/getApiRemoteMap_"
                 + str(shopid2) + "_shopScoreCallback.html?")
        # Remove the backslash escapes in the JSONP payload so the literal
        # regex below can match the embedded JSON.
        response2 = requests.get(pjurl, headers=headers).text.replace("\\", "")
        getpj = re.compile(
            r'{"parentIndexName":"评价","parentIndexScore":"(.*?)","serParentIndexCateCon":".*?","parentIndexCode":".*?"},{"parentIndexName":"物流","parentIndexScore":"(.*?)","serParentIndexCateCon":".*?","parentIndexCode":".*?"},{"parentIndexName":"售后","parentIndexScore":"(.*?)"')
        pjlist = re.findall(getpj, response2)
        if pjlist:
            print("店铺评分:" + str(pjlist[0][0]))
            print("物流评分:" + str(pjlist[0][1]))
            print("售后评分:" + str(pjlist[0][2]))
        else:
            print("评分:暂无店铺各项评分")
        # Detail page: item number ("货号") and shop name/link.
        response3 = requests.get(durl, headers=headers).text
        getNum = re.compile(r'<span>货号</span> </div> </td> <td class="val">(.*?)</td>')
        numlist = re.findall(getNum, response3)
        # 'item_id' instead of 'id' to avoid shadowing the builtin.
        if numlist:
            item_id = str(numlist[0])
            print("货号:" + item_id)
        else:
            item_id = "暂无"
            print("货号:暂无此商品货号")
        getShop = re.compile(r'<a id="chead_indexUrl" href="(.*?)" title="(.*?)">')
        shopList = re.findall(getShop, response3)
        if shopList:
            shopI = shopList[0][1]
            shopH = shopList[0][0]
            print("店铺:" + shopI + " 店铺链接:" + shopH)
        else:
            shopI = "暂无"
            shopH = "暂无"
            print("店铺:" + "暂无" + " 店铺链接:" + "暂无")
        # Review-label summary, returned as a JSONP callback; unwrap it so
        # the body parses as plain JSON.
        pjurl = ("https://review.suning.com/ajax/getClusterReview_labels/style--0000000"
                 + key[0] + "-" + key[1] + "-----commodityrLabels.htm?")
        reponse4 = requests.get(pjurl, headers=headers).text.replace('commodityrLabels(', '').replace('})', '}')
        if reponse4:
            d = json.loads(reponse4)
            print("评论关键字个数:" + str(len(d["commodityLabelCountList"])))
            for q in d["commodityLabelCountList"]:
                print("label:" + q["labelName"] + " num:" + str(q["labelCnt"]))
        else:
            print("无")
        # Price endpoint (the long constant tail encodes region/channel ids).
        # Original had a stray leading space in this URL; removed.
        priceurl = ("https://pas.suning.com/nspcsale_0_0000000" + key[0]
                    + "_0000000" + key[0] + "_" + key[1]
                    + "_60_311_3110199_20089_1000095_9095_10638_Z001___R1901001_0.36_0___000060021____0___448.224_2_01_20002_20006__.html?")
        res = requests.get(priceurl, headers=headers).text
        getK = re.compile(r'"gbPrice":"(.*?)"')
        keyL = re.findall(getK, res)
        if keyL:
            price = str(keyL[0])
            print("价格:" + price)
        else:
            # Fall back to the net price when no group-buy price exists.
            getK = re.compile(r'"netPrice":"(.*?)"')
            keyL = re.findall(getK, res)
            if keyL:
                price = str(keyL[0])
                print("价格:" + price)
            else:
                price = "无"
                print("无价格:")
        orgin = "苏宁"
        item.append([time.strftime("%Y-%m-%d"), item_id, price, name, link, image, orgin])
        print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    mysql.insert_item(item)
    # Alternative upsert flow, kept for reference:
    # if not mysql.get_conn()[1]:
    #     mysql.insert_item(item)
    # else:
    #     mysql.update_item(item)
# def getprice(prdid, shopid):
# durl = "https://product.suning.com/"+shopid+"/"+prdid+".html"
# print("商品网址:"+durl)
#
# shopid2 = int(shopid)
# pjurl = "https://product.suning.com/pds-web/ajax/getApiRemoteMap_"+str(shopid2)+"_shopScoreCallback.html?"
# # print(pjurl)
# response2 = requests.get(pjurl).text.replace("\", "")
# getpj = re.compile(r'{"parentIndexName":"评价","parentIndexScore":"(.*?)","serParentIndexCateCon":".*?","parentIndexCode":".*?"},{"parentIndexName":"物流","parentIndexScore":"(.*?)","serParentIndexCateCon":".*?","parentIndexCode":".*?"},{"parentIndexName":"售后","parentIndexScore":"(.*?)"')
# pjlist = re.findall(getpj, response2)
# if pjlist:
# print("店铺评分:"+str(pjlist[0][0]))
# print("物流评分:"+str(pjlist[0][1]))
# print("售后评分:"+str(pjlist[0][2]))
# else:
# print("评分:暂无店铺各项评分")
#
# response3 = requests.get(durl).text
# getNum = re.compile(r'<span>货号</span> </div> </td> <td class="val">(.*?)</td>')
# numlist = re.findall(getNum, response3)
# if numlist:
# print("货号:"+str(numlist[0]))
# else:
# print("货号:暂无此商品货号")
#
# priceurl = " https://pas.suning.com/nspcsale_0_0000000"+prdid+"_0000000"+prdid+"_"+shopid+"_60_311_3110199_20089_1000095_9095_10638_Z001___R1901001_0.36_0___000060021____0___448.224_2_01_20002_20006__.html?"
# res = requests.get(priceurl).text
# # print(res)
# getK = re.compile(r'"gbPrice":"(.*?)"')
# keyL = re.findall(getK, res)
# if keyL:
# print("价格:"+str(keyL[0]))
# else:
# getK = re.compile(r'"netPrice":"(.*?)"')
# keyL = re.findall(getK, res)
# print("价格:"+str(keyL[0]))
if __name__ == '__main__':
    # Shared request headers; getitem() reads this module-level dict.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"
    }
    # Candidate category keywords. NOTE(review): currently unused — the loop
    # below only scrapes the fixed keyword "外套". The duplicate "艺术" entry
    # was removed (sets deduplicate, so this changes nothing at runtime).
    key = {"手机", "运营商", "智能数码", "家用电器", "帮客", "房产", "厨卫", "生活家电", "厨具", "电脑办公", "相机", "电竞", "家具", "家装", "家纺", "灯具",
           "食品", "酒水", "生鲜", "特产", "美妆", "个护", "清洁", "宠物", "母婴", "玩具", "车床", "童装", "运动", "户外", "国米", "骑行", "女装", "男装",
           "内衣", "鞋靴", "箱包", "钟表", "珠宝", "艺术", "汽车", "电摩", "汽车用品", "图书", "原版", "文学", "医药健康", "计生情趣", "理财", "分期",
           "便民"}
    # Crawl 200 result pages for the keyword "外套".
    for i in range(200):
        getitem("外套", i)