  • Products

    import requests
    from bs4 import BeautifulSoup
    import lxml  # HTML parser backend used by BeautifulSoup below
    import re
    import time
    import random
    import pymysql.cursors
    from selenium import webdriver
    import pandas
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='123',
                                 db='asd',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    # HTTP request headers copied from a browser session; the Cookie is what
    # lets the Tmall listing pages come back instead of a login redirect.
    headers = {
        "Accept-Encoding": "gzip, deflate, sdch, br",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Cookie": "hng=; uss=UIMY14A%2B04Bbq%2BqRxS6C9OzJWudsw14Q1kb5mDDqxW%2BQ3YG%2BUcpgrDRWnRQ%3D; uc3=sg2=AC4AfXCJ7XkLw0gCUD1tD9ZxhXFdweN2A6VfybWadxI%3D&nk2=&id2=&lg2=; t=3c0787f77a28e0854ef28fc360b2c555; cookie2=1c912d33e44bdb2008763748702a61f4; _tb_token_=78577371d8136; l=AiQkmjyCyPnG7qTN1Iu5fBqvdCgWvUgn; isg=AvDwL_qYXdDeegACSXGXiIOKwb7f2NSDXgsSOepBvMsepZFPkkmkE0aNixo_; pnm_cku822=; cna=T7gREcWMLDsCAavWmjBJPJpS; Hm_lvt_c478afee593a872fd45cb9a0d7a9da3b=1495496950; Hm_lpvt_c478afee593a872fd45cb9a0d7a9da3b=1495496950",
        "Host": "tanggulake.tmall.com",
        "Referer": "https://tanggulake.tmall.com/search.htm?spm=a220o.1000855.w5002-15900729481.1.b3kpys&search=y",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"}
    
    with connection.cursor() as cursor:
        # Load the list of competitor shops (店铺名称 + listing 地址) to crawl.
        sql = "select * from 竞店"
        cursor.execute(sql)
        q = cursor.fetchall()
    # connection is not autocommit by default, so commit explicitly after writes.
    connection.commit()
    
    
    def id_s():
        for dress in q:
            url = dress["地址"]
            # Fetch page 1 first to read the pagination text ("current/total").
            url_re = requests.get(url + "1", headers=headers)
            soup = BeautifulSoup(url_re.text, "lxml")
            pig = soup.select("div > div > div > div > span:nth-of-type(1)")
            get_pig = (pig[2].text.split("/"))[1]  # total number of listing pages
            print(get_pig)
            ids = []
            pigg = []
            dates1 = []
            for pij in range(1, int(get_pig) + 1):
                time.sleep(random.randrange(1, 5))  # random delay between requests
                ur1 = dress["地址"] + str(pij)
                url_re1 = requests.get(ur1, headers=headers)
                soup = BeautifulSoup(url_re1.text, "lxml")
                date = soup.select("div > div > div > dl")
                for spid in date:
                    # keep only the digits of data-id (the numeric item id)
                    ids.append(re.sub(r"\D", "", spid.get("data-id")))

                imgs = soup.select("img")  # product images
                for imgasd in imgs:
                    w = imgasd.get("src")
                    p = re.match(r".*//(.*?\.jpg)", w)
                    pigg.append(r"https://" + p.group(1))

                dated = soup.select("dl")  # item blocks with title and price
                for i in dated:
                    c = list(i.stripped_strings)  # strip surrounding whitespace
                    b = [elem for elem in c if elem != '']  # drop empty strings
                    dates1.append([b[0], b[2]])
            shuju2 = pandas.DataFrame(pigg)
            shuju2 = shuju2.rename(columns={0: "图片链接"})
            shuju3 = pandas.DataFrame(ids)
            shuju3 = shuju3.rename(columns={0: "id"})
            shuju1 = pandas.DataFrame(dates1)
            shuju1 = shuju1.rename(columns={0: "标题", 1: "价格"})
            result = pandas.concat([shuju1, shuju2, shuju3], axis=1)
            with connection.cursor() as cursor:
                # Fetch the ids already stored for this shop so only new items are inserted.
                sql = 'select id from' + " " + dress["店铺名称"]
                cursor.execute(sql)
                fff = cursor.fetchall()
                fff = [i["id"] for i in fff]
                for w in result.values:
                    if w[3] not in fff:
                        sql = "INSERT INTO " + dress["店铺名称"] + " (`id`,图片链接,价格,标题) VALUES (%s,%s,%s,%s)"
                        cursor.execute(sql, (w[3], w[2], w[1], w[0]))
            # connection is not autocommit by default, so commit the inserts.
            connection.commit()
    id_s()
    # Use a headless PhantomJS browser to load each item page and pull detail fields.
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--load-images=false'])
    with connection.cursor() as cursor:
        for i in q:
            # Read back all item ids stored for this shop.
            sql = 'select id from' + " " + i["店铺名称"]
            cursor.execute(sql)
            rows = cursor.fetchall()
            ids = [row["id"] for row in rows]
            for ids_find in ids:
                driver.get("http://item.taobao.com/item.htm?id=" + ids_find)
                time.sleep(10)  # wait for the page and its JavaScript to load
                date = driver.page_source
                soup = BeautifulSoup(date, "lxml")
                color = re.findall(r'<li title="(.*)">颜色分类|li title="(.*)">主要颜色', str(soup.select(".attributes-list")))
                color = [c for c in color[0] if c]
                leimu = soup.select(".tb-pine")[0].get("data-catid")  # category id
                a = ""
                # keep only the Chinese characters of the page title
                title = a.join(re.findall(r"[\u4e00-\u9fa5]", driver.title))
                id_dress = "http://item.taobao.com/item.htm?id=" + ids_find
                dates = [color, leimu, id_dress]
                with connection.cursor() as cursors:
                    # Write the detail fields back onto the existing row.
                    sql = "UPDATE " + i["店铺名称"] + " SET 颜色='%s',类目='%s',商品地址='%s' where id = '%s'" % (dates[0][0], dates[1], dates[2], ids_find)
                    print(sql)
                    cursors.execute(sql)
                connection.commit()
    connection.commit()
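
The script never creates the tables it reads and writes: it expects a 竞店 table holding each competitor shop's 店铺名称 and listing 地址, plus one item table per shop named after 店铺名称. Below is a minimal sketch of what that schema might look like; the column names are taken from the queries above, while the column types, the demo_shop table name, and the charset choice are assumptions for illustration.

    import pymysql.cursors

    # Sketch only: column names match the SQL used in the script above; the
    # types, the demo_shop table name, and the utf8mb4 charset are assumptions.
    connection = pymysql.connect(host='localhost', user='root', password='123',
                                 db='asd', charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    with connection.cursor() as cursor:
        # One row per competitor shop: its name (also used as the item table
        # name) and the listing URL that page numbers get appended to.
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS 竞店 (
                店铺名称 VARCHAR(64) NOT NULL PRIMARY KEY,
                地址     VARCHAR(255) NOT NULL
            ) CHARACTER SET utf8mb4""")
        # One item table per shop, e.g. for a hypothetical shop called demo_shop.
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS demo_shop (
                id       VARCHAR(32) PRIMARY KEY,
                标题     VARCHAR(255),
                价格     VARCHAR(32),
                图片链接 VARCHAR(255),
                颜色     VARCHAR(64),
                类目     VARCHAR(32),
                商品地址 VARCHAR(255)
            ) CHARACTER SET utf8mb4""")
    connection.commit()

With those tables in place, id_s() fills the per-shop tables from the listing pages, and the PhantomJS pass backfills 颜色, 类目 and 商品地址 for each stored id.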
  • Original post: https://www.cnblogs.com/gao-xiang/p/6953461.html