  • Scraping shop product IDs

    import requests
    from bs4 import BeautifulSoup
    import lxml  # the lxml package must be installed for the "lxml" parser used below
    import re
    import time
    import random
    import pymysql.cursors
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='123',
                                 db='asd',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    # HTTP request headers (including a logged-in session cookie) for the Tmall shop pages.
    payload = {
        "Accept-Encoding": "gzip, deflate, sdch, br",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Cookie": "hng=; uss=UIMY14A%2B04Bbq%2BqRxS6C9OzJWudsw14Q1kb5mDDqxW%2BQ3YG%2BUcpgrDRWnRQ%3D; uc3=sg2=AC4AfXCJ7XkLw0gCUD1tD9ZxhXFdweN2A6VfybWadxI%3D&nk2=&id2=&lg2=; t=3c0787f77a28e0854ef28fc360b2c555; cookie2=1c912d33e44bdb2008763748702a61f4; _tb_token_=78577371d8136; l=AiQkmjyCyPnG7qTN1Iu5fBqvdCgWvUgn; isg=AvDwL_qYXdDeegACSXGXiIOKwb7f2NSDXgsSOepBvMsepZFPkkmkE0aNixo_; pnm_cku822=; cna=T7gREcWMLDsCAavWmjBJPJpS; Hm_lvt_c478afee593a872fd45cb9a0d7a9da3b=1495496950; Hm_lpvt_c478afee593a872fd45cb9a0d7a9da3b=1495496950",
        "Host": "tanggulake.tmall.com",
        "Referer": "https://tanggulake.tmall.com/search.htm?spm=a220o.1000855.w5002-15900729481.1.b3kpys&search=y",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"}
    
    with connection.cursor() as cursor:
        # Load the list of competitor shops; each row carries 店铺名称 (shop name)
        # and 地址 (listing-page URL without its trailing page number).
        sql = "select * from 竞店"
        cursor.execute(sql)
        q = cursor.fetchall()
    
    
    
    for i in q:
        url = i["地址"]
        # Request the first listing page; the session cookie travels in the headers
        # (the dict must be passed as headers=, not params=).
        url_re = requests.get(url + "1", headers=payload)
        soup = BeautifulSoup(url_re.text, "lxml")
        # The third matched span reads "current page/total pages"; take the total.
        pig = soup.select("div > div > div > div > span:nth-of-type(1)")
        get_pig = pig[2].text.split("/")[1]
        print(get_pig)
        ids = []
        for pij in range(1, int(get_pig) + 1):
            time.sleep(random.randrange(1, 5))  # throttle the requests
            ur1 = i["地址"] + str(pij)
            url_re1 = requests.get(ur1, headers=payload)
            soup = BeautifulSoup(url_re1.text, "lxml")
            date = soup.select("div > div > div > dl")
            for spid in date:
                # Each product <dl> carries its item id in data-id; keep digits only.
                if spid.get("data-id"):
                    ids.append(re.sub(r"\D", "", spid.get("data-id")))
    
        with connection.cursor() as cursor:
            # Read the ids already stored in this shop's table (each shop has its
            # own table, named after the shop), then insert only the new ids.
            sql = "select id from " + i["店铺名称"]
            cursor.execute(sql)
            existing = [row["id"] for row in cursor.fetchall()]
            for w in ids:
                if w not in existing:
                    sql = "INSERT INTO " + i["店铺名称"] + " (`id`) VALUES (%s)"
                    cursor.execute(sql, (w,))

    # Autocommit is off by default, so commit at the end to persist the inserts.
    connection.commit()
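
    The script above assumes a MySQL database (here named asd) that already contains a 竞店 table listing the competitor shops (店铺名称 = shop name, 地址 = listing-page URL without its trailing page number), plus one table per shop, named after the shop, with an id column holding the collected product ids. Below is a minimal sketch of that assumed schema, created through the same pymysql connection; the column types and the example shop name tanggulake are assumptions for illustration only.

    import pymysql.cursors

    connection = pymysql.connect(host='localhost', user='root', password='123',
                                 db='asd', charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    with connection.cursor() as cursor:
        # Shop list: one row per competitor shop (name + listing-page URL).
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS `竞店` (
                `店铺名称` VARCHAR(64)  NOT NULL,
                `地址`     VARCHAR(255) NOT NULL
            ) DEFAULT CHARSET=utf8mb4
        """)
        # One table per shop, holding that shop's scraped product ids.
        # "tanggulake" is a hypothetical shop name used only as an example.
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS `tanggulake` (
                `id` VARCHAR(32) NOT NULL PRIMARY KEY
            ) DEFAULT CHARSET=utf8mb4
        """)
    connection.commit()

    Keeping one id table per shop matches the per-shop SELECT/INSERT in the script, and the PRIMARY KEY on id also guards against duplicate inserts at the database level.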
  • Original article: https://www.cnblogs.com/gao-xiang/p/6949794.html