zoukankan      html  css  js  c++  java
  • Python爬取淘宝商品信息写入mysql

    直接上代码:(商品名称、单价、图片链接)

    import pymysql
    import requests
    import re
    
    
    def getHTMLText(url):
        kv = {'cookie':'thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=ooWAQ8HPiBkBlDgWaQ2BoQXFD4cHXejeOP0Nq7xvbCuGN5yubT%2ByBjrb2j417KSrQkoR9YQxMFoqYufejy7Hlw%3D%3D; _m_h5_tk=9cc0be22588c97655e9e0ed031f29703_1589472803622; _m_h5_tk_enc=8fd3fcd9077f0f17bcb2dc4f9d593617; cookie2=1a0da2cc9535ebe4f7bd2787bebb9da1; t=0a472589a79eda4e33e9b072e3446525; _tb_token_=e136e0330e37e; alitrackid=www.taobao.com; _samesite_flag_=true; cna=cAVGFNpuDkUCAXkcRV0prgNa; sgcookie=EGxNSorLw1t5Dg21WTFJw; unb=3361002229; uc3=nk2=qA%2Fo8e0UjX1l%2BUs%3D&lg2=UtASsssmOIJ0bQ%3D%3D&id2=UNN78Eg15kheYA%3D%3D&vt3=F8dBxGZobO%2BfXgtBG40%3D; csg=228d2d4a; lgc=%5Cu5E05%5Cu6C14%5Cu5CF0happy; cookie17=UNN78Eg15kheYA%3D%3D; dnk=%5Cu5E05%5Cu6C14%5Cu5CF0happy; skt=aa63ca1a4a6e356c; existShop=MTU4OTYzNTEyOQ%3D%3D; uc4=nk4=0%40qjS8tzpCQQHfNZapqmDNrmd4%2F2Dhnw%3D%3D&id4=0%40UgQz06zOiEqwpAtViK7HqZlIKslx; tracknick=%5Cu5E05%5Cu6C14%5Cu5CF0happy; _cc_=U%2BGCWk%2F7og%3D%3D; _l_g_=Ug%3D%3D; sg=y99; _nk_=%5Cu5E05%5Cu6C14%5Cu5CF0happy; cookie1=UNJSu1S2nK7AhsBSrVKq4Nd7T4K1fH40ygcHPrTYWeA%3D; lastalitrackid=login.taobao.com; tfstk=ccVGB7cFtRk12ge_PFG_ovbuN0aGaiXZfSPUT5GPww8ivnNE7sYkLLSdMwmTjSpf.; uc1=cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D&cookie21=VT5L2FSpccLuJBreK%2BBd&cookie15=UIHiLt3xD8xYTw%3D%3D&existShop=false&pas=0&cookie14=UoTUM2YYf7HXaw%3D%3D; mt=ci=21_1; v=0; JSESSIONID=EC27C014A7BFB337D51D380F31E05C14; l=eBTRDRMIq3U2_UibBOfwourza77OSIRAguPzaNbMiOCPO_fp5GiGWZb3Hg89C3GVh67HR3J_JUNTBeYBqIv4n5U62j-la_kmn; isg=BA4O1D8B3Mb76GvkyGwHSEkCX-TQj9KJQPZkSjhXepHMm671oB8imbRZ08f3g8qh',
              'user-agent':'Mozilla/5.0'}
        try:
            r = requests.get(url, headers=kv,timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            #print(r.text)
            return r.text
        except:
            return ""
    
    
    def parsePage(ilt, html):
        try:
            plt = re.findall(r'"view_price":"[d.]*"', html)
            tlt = re.findall(r'"raw_title":".*?"', html)
            pic=re.findall(r'"pic_url":".*?"',html)
            k=0
            kind="甘果类零食"
            for i in range(len(plt)):
                price = eval(plt[i].split(':')[1])
                title = eval(tlt[i].split(':')[1])
                img="https:"+eval(pic[i].split(':')[1])
                oldprice=price.replace('1','4')
                ilt.append([k,title,price,oldprice,100,img,kind])
        except:
            print("")
    
    
    def printGoodsList(ilt):
        db = pymysql.connect("localhost", "root", "511924", "summerperiod", charset='utf8')
        cursor = db.cursor()
        sql_cixian = "INSERT INTO food values (%s,%s,%s,%s,%s,%s,%s)"
        cursor.executemany(sql_cixian, ilt)
        db.commit()
        db.close()
    
        tplt = "{:4}	{:8}	{:16}	{:16}"
        print(tplt.format("序号", "价格", "商品名称","商品图片"))
        count = 0
        for g in ilt:
            count = count + 1
            print(tplt.format(count, g[0], g[1],g[2]))
    
    
    def main():
        goods = '甘果类零食'
        depth = 2
        start_url = 'https://s.taobao.com/search?q=' + goods
        infoList = []
        for i in range(depth):
            try:
                url = start_url + '&s=' + str(44 * i)
                html = getHTMLText(url)
                parsePage(infoList, html)
            except:
                continue
        printGoodsList(infoList)
    
    
    main()

     更改后缀和链接就可以爬取你想要的商品。(这里是以食品为例)

  • 相关阅读:
    监控里的主码流和子码流是什么意思
    监控硬盘容量计算
    一个能让你了解所有函数调用顺序的Android库
    电工选线
    oracle linux dtrace
    list all of the Oracle 12c hidden undocumented parameters
    Oracle Extended Tracing
    window 驱动开发
    win7 x64 dtrace
    How to Use Dtrace Tracing Ruby Executing
  • 原文地址:https://www.cnblogs.com/dazhi151/p/14297272.html
Copyright © 2011-2022 走看看