zoukankan      html  css  js  c++  java
  • Python爬取淘宝商品信息写入mysql

    直接上代码:(商品名称、单价、图片链接)

    import pymysql
    import requests
    import re
    
    
    def getHTMLText(url):
        kv = {'cookie':'thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=ooWAQ8HPiBkBlDgWaQ2BoQXFD4cHXejeOP0Nq7xvbCuGN5yubT%2ByBjrb2j417KSrQkoR9YQxMFoqYufejy7Hlw%3D%3D; _m_h5_tk=9cc0be22588c97655e9e0ed031f29703_1589472803622; _m_h5_tk_enc=8fd3fcd9077f0f17bcb2dc4f9d593617; cookie2=1a0da2cc9535ebe4f7bd2787bebb9da1; t=0a472589a79eda4e33e9b072e3446525; _tb_token_=e136e0330e37e; alitrackid=www.taobao.com; _samesite_flag_=true; cna=cAVGFNpuDkUCAXkcRV0prgNa; sgcookie=EGxNSorLw1t5Dg21WTFJw; unb=3361002229; uc3=nk2=qA%2Fo8e0UjX1l%2BUs%3D&lg2=UtASsssmOIJ0bQ%3D%3D&id2=UNN78Eg15kheYA%3D%3D&vt3=F8dBxGZobO%2BfXgtBG40%3D; csg=228d2d4a; lgc=%5Cu5E05%5Cu6C14%5Cu5CF0happy; cookie17=UNN78Eg15kheYA%3D%3D; dnk=%5Cu5E05%5Cu6C14%5Cu5CF0happy; skt=aa63ca1a4a6e356c; existShop=MTU4OTYzNTEyOQ%3D%3D; uc4=nk4=0%40qjS8tzpCQQHfNZapqmDNrmd4%2F2Dhnw%3D%3D&id4=0%40UgQz06zOiEqwpAtViK7HqZlIKslx; tracknick=%5Cu5E05%5Cu6C14%5Cu5CF0happy; _cc_=U%2BGCWk%2F7og%3D%3D; _l_g_=Ug%3D%3D; sg=y99; _nk_=%5Cu5E05%5Cu6C14%5Cu5CF0happy; cookie1=UNJSu1S2nK7AhsBSrVKq4Nd7T4K1fH40ygcHPrTYWeA%3D; lastalitrackid=login.taobao.com; tfstk=ccVGB7cFtRk12ge_PFG_ovbuN0aGaiXZfSPUT5GPww8ivnNE7sYkLLSdMwmTjSpf.; uc1=cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D&cookie21=VT5L2FSpccLuJBreK%2BBd&cookie15=UIHiLt3xD8xYTw%3D%3D&existShop=false&pas=0&cookie14=UoTUM2YYf7HXaw%3D%3D; mt=ci=21_1; v=0; JSESSIONID=EC27C014A7BFB337D51D380F31E05C14; l=eBTRDRMIq3U2_UibBOfwourza77OSIRAguPzaNbMiOCPO_fp5GiGWZb3Hg89C3GVh67HR3J_JUNTBeYBqIv4n5U62j-la_kmn; isg=BA4O1D8B3Mb76GvkyGwHSEkCX-TQj9KJQPZkSjhXepHMm671oB8imbRZ08f3g8qh',
              'user-agent':'Mozilla/5.0'}
        try:
            r = requests.get(url, headers=kv,timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            #print(r.text)
            return r.text
        except:
            return ""
    
    
    def parsePage(ilt, html):
        try:
            plt = re.findall(r'"view_price":"[d.]*"', html)
            tlt = re.findall(r'"raw_title":".*?"', html)
            pic=re.findall(r'"pic_url":".*?"',html)
            k=0
            kind="甘果类零食"
            for i in range(len(plt)):
                price = eval(plt[i].split(':')[1])
                title = eval(tlt[i].split(':')[1])
                img="https:"+eval(pic[i].split(':')[1])
                oldprice=price.replace('1','4')
                ilt.append([k,title,price,oldprice,100,img,kind])
        except:
            print("")
    
    
    def printGoodsList(ilt):
        db = pymysql.connect("localhost", "root", "511924", "summerperiod", charset='utf8')
        cursor = db.cursor()
        sql_cixian = "INSERT INTO food values (%s,%s,%s,%s,%s,%s,%s)"
        cursor.executemany(sql_cixian, ilt)
        db.commit()
        db.close()
    
        tplt = "{:4}	{:8}	{:16}	{:16}"
        print(tplt.format("序号", "价格", "商品名称","商品图片"))
        count = 0
        for g in ilt:
            count = count + 1
            print(tplt.format(count, g[0], g[1],g[2]))
    
    
    def main():
        goods = '甘果类零食'
        depth = 2
        start_url = 'https://s.taobao.com/search?q=' + goods
        infoList = []
        for i in range(depth):
            try:
                url = start_url + '&s=' + str(44 * i)
                html = getHTMLText(url)
                parsePage(infoList, html)
            except:
                continue
        printGoodsList(infoList)
    
    
    main()

     更改后缀和链接就可以爬取你想要的商品。(这里是以食品为例)

  • 相关阅读:
    txtexcelcvsxml存储测试数据
    webstorm 格式化代码(CTR+ALT+L)快捷键失效?
    解决jQuery触发dbclick事件同时也执行click事件
    css经典布局——头尾固定高度中间高度自适应布局
    js 如何访问跨域的iframe的元素
    获取textarea文本框所选字符光标位置索引,以及选中的文本值;textarea高度自适应,随着内容增加高度增加;获取输入框中的光标位置
    js 如何计算当年清明节日期
    验证插件使用笔记
    node 升级之后 执行gulp报错解决方案
    scss css管理相关
  • 原文地址:https://www.cnblogs.com/dazhi151/p/14297272.html
Copyright © 2011-2022 走看看