  • Scraping Tmall phone reviews

    import re
    import json
    import time
    import requests
    from bs4 import BeautifulSoup 
     
     
    tm_headers = {
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
                "Cache-Control": "max-age=0",
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                # only advertise encodings requests can decode out of the box;
                # "br" needs the brotli package installed
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                "Content-Type": "text/html"
            }
     
    def req(url, headers):
        # Fetch a url and return a parsed BeautifulSoup document, or None on failure.
        soup = None
        try:
            content = requests.get(url, headers=headers, timeout=2)
            code = content.status_code
            if code == 200:
                soup = BeautifulSoup(content.text, "html.parser")
        except Exception as e:
            print("get url error, url: {0}, error: {1}".format(url, e))
        return soup
    
    def get_phone_list():
        # Collect the detail-page url and name of each phone from the listing page.
        phone_list = []
        list_url = "https://shouji.tmall.com/?spm=a222t.8063993.a2226c3nav.5.7b8f4da0yjyxC3&acm=lb-zebra-155904-807029.1003.4.767290&scm=1003.4.lb-zebra-155904-807029.OTHER_14592967254716_767290#J_floor12"
        soup = req(list_url, tm_headers)
        if soup is None:
            return phone_list
        txt = soup.find_all("li", class_="focus-")
        for i in txt[:-5]:  # skip the trailing entries, which are not product items
            a = i.find("a")
            name = i.find("h3").get_text()
            href = a.get("href")
            if name != "":
                phone_list.append({"url": "https:" + href, "name": name})
        return phone_list
    
    def create_deltail_url(url, page=1, itemid=None, sellerid=None):
        # Build the review-API url. The API needs two ids, itemid and sellerid;
        # sellerid is only available on the item detail page, so fetch it once
        # and pass both ids back in for subsequent pages.
        if itemid is None and sellerid is None:
            itemid = url.split("id=")[-1].split("&")[0]
            soup = req(url, tm_headers)
            txt = soup.find_all("meta")[-1].get("content")
            sellerid = txt.split("userid=")[-1].replace(";", "")
        comment_json_url = "https://rate.tmall.com/list_detail_rate.htm?itemId={0}&sellerId={1}&currentPage={2}".format(itemid, sellerid, page)
        return comment_json_url, itemid, sellerid

    def get_deltail(db, comment_json_url, itemid, sellerid, name):
        # Call the review API and collect the review data.
        pagenum = None
        comment_data = req(comment_json_url, tm_headers)
        if comment_data is not None:
            # Retry a few times until the response contains the paginator block.
            count = 1
            while "paginator" not in str(comment_data) and count < 5:
                comment_data = req(comment_json_url, tm_headers)
                count += 1
                time.sleep(1)
            try:
                # Strip the endpoint's non-JSON wrapper prefix before parsing
                # (the 15-character offset is taken from the original post).
                comment_str = str(comment_data)[15:]
                comment_json = json.loads(comment_str)
            except Exception:
                return None
            rateList = comment_json["rateList"]
            for item in rateList:
                data = {}
                data["itemid"] = itemid
                data["usernick"] = item["displayUserNick"]
                data["comment_content"] = item["rateContent"]
                data["comment_date"] = item["rateDate"]
                data["sellerid"] = sellerid
                db.append(data)  # stand-in for a real database insert
            pagenum = comment_json["paginator"]["lastPage"]
        return pagenum

    if __name__ == "__main__":
        db = []  # in-memory store; swap in a real database if needed
        phone_list = get_phone_list()
        for phone_url in phone_list:
            name = phone_url["name"]
            url = phone_url["url"]
            print("Scraping phone: {0}, page: {1}".format(name, 1))
            comment_json_url, itemid, sellerid = create_deltail_url(url)
            pagenum = get_deltail(db, comment_json_url, itemid, sellerid, name)
            if pagenum is not None:
                page = 2
                while page <= pagenum:  # <= so the last page is fetched too
                    print("Scraping phone: {0}, page: {1}".format(name, page))
                    comment_json_url, itemid, sellerid = create_deltail_url(phone_url["url"], page, itemid, sellerid)
                    get_deltail(db, comment_json_url, itemid, sellerid, name)
                    page += 1
                    time.sleep(2)

  This tutorial is for technical study and research only; if it infringes any rights, contact the author for removal.

  • Original post: https://www.cnblogs.com/dockers/p/7767914.html