  • Taobao crawler

    import requests
    import re
    import pymysql

    conn = pymysql.connect(host="127.0.0.1", user="root", passwd="123456", db="world")
    def get_url():
        for i in range(5, 11):
            headers1 = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
            }
            url = 'https://s.taobao.com/search?spm=a21bo.2017.201856-fline.2.1b3311d9sSXobt&q=%E5%9B%9B%E4%BB%B6%E5%A5%97&s=' + str(i*44)  # search entry URL; s=i*44 pages through the results
            response1 = requests.get(url, headers=headers1)
            data = response1.text
            bt_1 = '"raw_title":"(.*?)","pic_url"'      # item title
            tp_1 = '"pic_url":"//(.*?)"'                # picture URL
            spid_1 = '"nid":"(.*?)","category"'         # item id
            xl_1 = '"view_sales":"(.*?)"'               # sales text
            dm_1 = '"nick":"(.*?)"'                     # shop name
            jg_1 = '"view_price":"(.*?)","view_fee"'    # price
            user_id1 = '"user_id":"(.*?)","nick":".*?"'
            comment_url1 = '"detail_url":"(.*?)"'
            bt = re.compile(bt_1).findall(str(data))
            tp=re.compile(tp_1).findall(str(data))
            spid=re.compile(spid_1).findall(str(data))
            print(spid)
            xl=re.compile(xl_1).findall(str(data))
            dm=re.compile(dm_1).findall(str(data))
            jg=re.compile(jg_1).findall(str(data))
            for j in range(0,len(bt)):
                bt1=bt[j]
                tp1='https://'+tp[j]
                spid1=spid[j]
                xl1=xl[j]
                dm1=dm[j]
                headers = {
                        'Referer': 'https://item.taobao.com/item.htm?spm=a1z10.5-c-s.w4002-18518582505.20.6d887041nVz3D2&id=' + spid1,  # this Referer header must be included
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
                    }
                response = requests.get(
                        'https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId=' + spid1 + '&sellerId=102291787&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,activity,fqg,zjys,couponActivity,soldQuantity,originalPrice,tradeContract&callback=onSibRequestSuccess',  # this endpoint carries the sales-count data (note the hard-coded sellerId)
                        headers=headers)
                data3 = response.text
                sold = '"sellCountDO":{"sellCount":"(.*?)","success":true}'  # regex for the sales count
                soldTotalCount = re.compile(sold).findall(data3)
                print('page ' + str(i + 1) + ', item ' + str(j + 1))
                print(bt1)
                print(spid1)
                print(soldTotalCount)
                jg1 = jg[j]
                soldTotalCount1 = soldTotalCount[0] if soldTotalCount else ''
                sql = "insert into taobaopc1(bt,tp,spid,xl,dm,jg) values('" + bt1 + "','" + tp1 + "','" + spid1 + "','" + soldTotalCount1 + "','" + dm1 + "','" + jg1 + "')"
                print(sql)
                conn.query(sql)
                conn.commit()

    get_url()
    conn.close()
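
    The INSERT above is built by string concatenation, which breaks as soon as a title contains a single quote and leaves the script open to SQL injection. The sketch below shows a parameterized variant with pymysql; the taobaopc1 schema and the utf8mb4 charset are assumptions inferred from the INSERT statement, not taken from the original post.

        # Minimal sketch (assumed, not from the original post: the taobaopc1 schema
        # below and the utf8mb4 charset).
        import pymysql

        # Hypothetical schema, matching the columns used in the INSERT above:
        # CREATE TABLE taobaopc1 (
        #     bt VARCHAR(255), tp VARCHAR(512), spid VARCHAR(64),
        #     xl VARCHAR(64), dm VARCHAR(255), jg VARCHAR(32)
        # );
        conn = pymysql.connect(host="127.0.0.1", user="root", passwd="123456",
                               db="world", charset="utf8mb4")

        def save_item(bt1, tp1, spid1, xl1, dm1, jg1):
            # Placeholders let pymysql escape the values, so quotes in a title
            # cannot break the statement.
            sql = ("insert into taobaopc1(bt,tp,spid,xl,dm,jg) "
                   "values(%s,%s,%s,%s,%s,%s)")
            with conn.cursor() as cursor:
                cursor.execute(sql, (bt1, tp1, spid1, xl1, dm1, jg1))
            conn.commit()

    Inside the loop, the hand-built SQL string would then be replaced by a call such as save_item(bt1, tp1, spid1, soldTotalCount1, dm1, jg1).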
  • Original article: https://www.cnblogs.com/snackpython/p/10329298.html