zoukankan      html  css  js  c++  java
  • 电商 生意参谋 抓取 访客数据 Python版

    import requests
    import json
    import pandas as pd
    import time
    
    
    # getRtVisitor.json
    
    # One shared Session object, reused by every request issued in this script.
    session = requests.Session()  # create a Session object
    # Request headers sent with every call.
    # NOTE(review): the values look like they were copied from a logged-in
    # browser session; the 'cookie' entry is presumably account- and
    # session-specific and will need to be replaced with a fresh one -- confirm.
    # NOTE(review): 'accept-encoding' advertises 'br'; confirm the runtime can
    # decode Brotli responses, otherwise the response text may be unreadable.
    headers = {
    'accept':'*/*',
    'accept-encoding':'gzip, deflate, br',
    'accept-language':'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    'cookie':'t=4a09a3c800e9591a2c198b124d80e02a; cna=G2veFsG5YX0CAQ5pIwfTWHFB; lgc=%5Cu5927%5Cu5468%5Cu4F73; tracknick=%5Cu5927%5Cu5468%5Cu4F73; tg=0; _euacm_ac_l_uid_=1639181234; 1639181234_euacm_ac_c_uid_=1639181234; 1639181234_euacm_ac_rs_uid_=1639181234; _portal_version_=new; cc_gray=1; thw=cn; mt=ci=84_1; _euacm_ac_rs_sid_=155550734; enc=Rk6EFG1Zi%2F5sBqHEJADmGhcHg%2F8HVbpOVQDb72MK8zmamaAE7C23fINFdlW5BT%2FNfkRKZLPkB8gKw%2Bj0cK0hig%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; cookie2=1a5b17269a2a570500957e7dcce65c45; _tb_token_=35e1e66785eb7; _samesite_flag_=true; sgcookie=Ep88649ag1i3VPa4P6akw; unb=1639181234; uc3=vt3=F8dBxd9nWWLAbiRWGZE%3D&lg2=UtASsssmOIJ0bQ%3D%3D&nk2=1z8nwQRn&id2=Uoe3fo1fFq44Zg%3D%3D; csg=aa9633c3; cookie17=Uoe3fo1fFq44Zg%3D%3D; dnk=%5Cu5927%5Cu5468%5Cu4F73; skt=69bf89596c5fe0d1; existShop=MTU4NDU4MTc0MQ%3D%3D; uc4=nk4=0%401fDckZcjfHBEZVI1NQCO3RY%3D&id4=0%40UO%2BxIxkhtoiLbBQVUeEnpSn1KHbH; _cc_=U%2BGCWk%2F7og%3D%3D; _l_g_=Ug%3D%3D; sg=%E4%BD%B34e; _nk_=%5Cu5927%5Cu5468%5Cu4F73; cookie1=BYXJ7V2Aq8c%2FnceEFKLACXxZ7qw2VwJYwgQbxY%2Feb6A%3D; tfstk=cKiNBsfOWhKwV3THPur4G7Z13THOZAHinMy7S4xicwaWPJ4GiQfYKvNUL-NJxPf..; v=0; _m_h5_tk=8b49ef5c39700daef25a92f8c520cc65_1584590383214; _m_h5_tk_enc=4d9374ef7e4743b408cb6247793448be; uc1=cookie14=UoTUPvXUU0lRTg%3D%3D&lng=zh_CN&cookie16=Vq8l%2BKCLySLZMFWHxqs8fwqnEw%3D%3D&existShop=true&cookie21=UtASsssmfavZrexPkAwn7A%3D%3D&tag=8&cookie15=W5iHLLyFOGW7aA%3D%3D&pas=0; XSRF-TOKEN=673a0ad9-3e96-4fb2-bbd9-af3ffa50a9bc; JSESSIONID=B0A57AC70F717D1231606D512F674ECB; l=dBjCTtrcQuukdAdWBOfgqK_ahxbOrIRbzsPPlS9CCICP9O1wrJp1WZ4V8-8eCnGN36J6R3WhGKB3BqLTFPaOhtikBBrsDOsCydTBR; isg=BAoK5gRhYDd8MOy0NcE2jJ5BW_Cs-45VYmG9hZRDct30R6gBfIkRZ2NxV7ubtwbt',
    'referer':'https://sycm.taobao.com/ipoll/visitor.htm?spm=a21ag.7622617.LeftMenu.d181.758a1be9MfheaI',
    'sec-fetch-mode':'cors',
    'sec-fetch-site':'same-origin',
    'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
    }
    
    
    def loadPage(page, token='6f5921e86', timestamp='1584581855701', timeout=30):
        """Fetch one page of the visitor list and return the decoded JSON.

        The raw response body is also written to ``<page>.txt`` in the current
        working directory so it can be inspected afterwards.

        Args:
            page: Page number to request (the script starts at 1); converted
                with ``str`` before being placed in the URL.
            token: Value of the ``token`` query parameter. The default is the
                value captured with the original request; presumably it is
                tied to the logged-in session and must be refreshed -- TODO
                confirm.
            timestamp: Value of the ``_`` query parameter. The default is the
                originally captured value; it looks like a millisecond
                timestamp used as a cache-buster -- TODO confirm.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the script forever.

        Returns:
            The response body parsed with ``json.loads``.

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                4xx/5xx HTTP status.
            ValueError: If the response body is not valid JSON.
        """
        url = (
            'https://sycm.taobao.com/ipoll/live/visitor/getRtVisitor.json'
            '?device=2&limit=20&page=' + str(page)
            + '&token=' + str(token)
            + '&type=Y&_=' + str(timestamp)
        )
        # Example of a captured request:
        # https://sycm.taobao.com/ipoll/live/visitor/getRtVisitor.json?device=2&limit=20&page=2&token=97890b4c6&type=Y&_=1584409617907
        req = session.get(url, headers=headers, timeout=timeout)  # issue the GET request
        text = req.text
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent the Chinese text contained in the response.
        with open(str(page) + ".txt", "w", encoding="utf-8") as f:
            f.write(text)
        # Checked after the dump so the raw body of a failed request is still
        # available on disk for debugging.
        req.raise_for_status()
        return json.loads(text)
    
    
    def GetValueFromDic(item, key, defaultValue=None):
        """Return ``item[key]`` when present, otherwise a fallback value.

        Args:
            item: Container supporting ``in`` and ``[]`` (a dict in this script).
            key: Key to look up.
            defaultValue: Value returned when ``key`` is absent. ``None`` (the
                default) means "use the empty string".

        Returns:
            The stored value (returned as-is, even if it is ``None``), or
            ``defaultValue``, or ``''`` when no default was supplied.
        """
        fallback = '' if defaultValue is None else defaultValue
        return item[key] if key in item else fallback
    
    
    def appendList(json_res):
        """Convert one page of visitor records into rows and append them to ``list_excel``.

        Each record becomes a six-element list in the order: visit time,
        traffic source, search keyword, visited page, visitor location,
        visitor id. Records whose ``pageTypeId`` equals 5 (category page) get
        fixed values for source, keyword, page title and location.

        Every field is read through ``GetValueFromDic`` so that a record
        lacking a key (including ``pageTypeId`` itself) yields an empty cell
        instead of aborting the whole export with ``KeyError``.

        Args:
            json_res: Decoded response; the records are read from
                ``json_res['data']['data']['list']``.

        Returns:
            None. Rows are appended to the module-level ``list_excel``.

        Raises:
            KeyError: If the ``data`` / ``data`` / ``list`` path is missing.
        """
        for item in json_res['data']['data']['list']:
            if GetValueFromDic(item, 'pageTypeId') == 5:  # category page
                row = [
                    GetValueFromDic(item, 'visitTime'),
                    '其它来源',
                    '',  # search keyword: not filled in for category pages
                    '商品分类页',
                    '',  # visitor location: not filled in for category pages
                    GetValueFromDic(item, 'oid'),
                ]
            else:
                row = [
                    GetValueFromDic(item, 'visitTime'),
                    GetValueFromDic(item, 'srcGrpName', '其它来源'),
                    GetValueFromDic(item, 'preSeKeyword'),
                    GetValueFromDic(item, 'title'),
                    GetValueFromDic(item, 'cityName'),
                    GetValueFromDic(item, 'oid'),
                ]
            list_excel.append(row)
    
    
    # Accumulator for all rows; appendList() appends to this module-level list.
    list_excel = []

    # Fetch the first page; it also carries the paging information.
    json_res = loadPage(1)
    paging = json_res['data']['data']
    totalPage = paging['totalPage']
    curPage = paging['page']
    print('总页数:', totalPage, '当前页:', curPage)

    # Rows of the first page.
    appendList(json_res)

    # Download and collect the remaining pages (2 .. totalPage).
    for page in range(2, totalPage + 1):
        print('当前页:', page)
        json_res = loadPage(page)
        appendList(json_res)

    # Build the DataFrame. The Chinese headers correspond, in order, to the
    # record fields visitTime, srcGrpName, preSeKeyword, title, cityName, oid.
    column_names = ['访问时间', '入店来源', '搜索关键字', '被访页面', '访客位置', '访客编号']
    df = pd.DataFrame(list_excel, columns=column_names)

    # Save to a local Excel file whose name carries the current local time.
    curTime = time.strftime('%Y-%m-%d %H%M%S')
    df.to_excel('访客-' + curTime + '.xlsx', index=False)
    
    

    mark

    pageTypeId:访客访问页面的类型
    具体看这里:item['pageTypeId'] == 5:# 商品分类页
    pageTypeId==5,表示用户是从商品分类页来的,或者点了商品分类页

  • 相关阅读:
    zmap zgrab 环境搭建
    RF是如何工作的?
    RF的优缺点
    国内NLP的那些人那些会
    B-、B+、B*树
    关于LDA的gibbs采样,为什么可以获得正确的样本?
    LDA算法里面Dirichlet分布的两个参数alpha和beta怎样确定?
    如何确定LDA的主题个数
    SMO算法精解
    奇异值与主成分分析(PCA)
  • 原文地址:https://www.cnblogs.com/guxingy/p/12932364.html
Copyright © 2011-2022 走看看