import requests
from bs4 import BeautifulSoup
import lxml  # noqa: F401 -- needed at runtime by BeautifulSoup's "lxml" parser
import re
import time
import random
import pymysql.cursors
from selenium import webdriver

# Connection to the local bookkeeping database.
# NOTE(review): credentials are hard-coded -- move them to env vars / config.
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='123',
                             db='asd',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)

# HTTP request headers for the Tmall shop listing pages.
# BUG FIX: the first key was misspelled "Ancoding"; the intended header is
# "Accept-Encoding". All other values (cookie included) are unchanged.
payload = {
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "keep-alive",
    "Cookie": "hng=; uss=UIMY14A%2B04Bbq%2BqRxS6C9OzJWudsw14Q1kb5mDDqxW%2BQ3YG%2BUcpgrDRWnRQ%3D; uc3=sg2=AC4AfXCJ7XkLw0gCUD1tD9ZxhXFdweN2A6VfybWadxI%3D&nk2=&id2=&lg2=; t=3c0787f77a28e0854ef28fc360b2c555; cookie2=1c912d33e44bdb2008763748702a61f4; _tb_token_=78577371d8136; l=AiQkmjyCyPnG7qTN1Iu5fBqvdCgWvUgn; isg=AvDwL_qYXdDeegACSXGXiIOKwb7f2NSDXgsSOepBvMsepZFPkkmkE0aNixo_; pnm_cku822=; cna=T7gREcWMLDsCAavWmjBJPJpS; Hm_lvt_c478afee593a872fd45cb9a0d7a9da3b=1495496950; Hm_lpvt_c478afee593a872fd45cb9a0d7a9da3b=1495496950",
    "Host": "tanggulake.tmall.com",
    "Referer": "https://tanggulake.tmall.com/search.htm?spm=a220o.1000855.w5002-15900729481.1.b3kpys&search=y",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}

# Load the competitor-shop list once at startup.  Each row of 竞店 is expected
# to carry 地址 (listing URL prefix) and 店铺名称 (per-shop table name) --
# assumed from the usage below; confirm against the table schema.
with connection.cursor() as cursor:
    cursor.execute("select * from 竞店")
    q = cursor.fetchall()
connection.commit()


def id_s():
    """Crawl every competitor shop's listing pages and insert any product
    ids that are not yet present in that shop's table.

    Uses the module-level ``q`` (rows from 竞店: ``地址`` = listing URL
    prefix, ``店铺名称`` = per-shop table name), ``payload`` and
    ``connection``.
    """
    for shop in q:
        base_url = shop["地址"]
        # Fetch page 1 first only to read the "current/total" page counter.
        # BUG FIX: ``payload`` holds HTTP headers, so it must be sent via
        # headers=, not params= (which put them on the query string).
        first_page = requests.get(base_url + "1", headers=payload)
        soup = BeautifulSoup(first_page.text, "lxml")
        spans = soup.select("div > div > div > div > span:nth-of-type(1)")
        # The third matching span reads like "current/total"; take the total.
        page_total = spans[2].text.split("/")[1]
        print(page_total)

        ids = []
        for page_no in range(1, int(page_total) + 1):
            time.sleep(random.randrange(1, 5))  # polite crawl delay
            page = requests.get(base_url + str(page_no), headers=payload)
            page_soup = BeautifulSoup(page.text, "lxml")
            for item in page_soup.select("div > div > div > dl"):
                # BUG FIX: r"\D" keeps only the digits of data-id; the
                # original "D" merely removed literal capital D's.
                ids.append(re.sub(r"\D", "", item.get("data-id")))

        with connection.cursor() as cursor:
            # Table names cannot be bound parameters; 店铺名称 comes from our
            # own 竞店 table, not from user input.
            cursor.execute("select id from " + shop["店铺名称"])
            # BUG FIX: the original assigned the result back to ``q``, which
            # made ``q`` function-local and raised UnboundLocalError on the
            # very first ``for`` above.  Use a distinct local name instead.
            existing = [row["id"] for row in cursor.fetchall()]
            for new_id in ids:
                if new_id not in existing:
                    cursor.execute(
                        "INSERT INTO " + shop["店铺名称"] + "(`id`) VALUES (%s)",
                        new_id)
            connection.commit()


# Headless browser for the per-item detail pages (they require JS rendering).
# NOTE(review): PhantomJS is deprecated in Selenium -- consider headless Chrome.
driver = webdriver.PhantomJS(
    service_args=['--ignore-ssl-errors=true', '--load-images=false'])

# Pre-compiled patterns for the detail-page scrape.
# BUG FIXES vs. original: the CJK range was written without backslashes
# ("[u4e00-u9fa5]", matching single letters/digits instead of CJK chars),
# and the image expression used ".str(" where a ", str(" argument separator
# was intended, so the image match always failed into the except branch.
_COLOR_RE = re.compile(r'<li title="(.*)">颜色分类|li title="(.*)">主要颜色')
_IMG_RE = re.compile(r'.*?(//.*?alicdn.*?)_50x50\.jpg')
_CJK_RE = re.compile(u"([\u4e00-\u9fa5])")

with connection.cursor() as cursor:
    for shop in q:
        cursor.execute("select id from " + shop["店铺名称"])
        ids = [row["id"] for row in cursor.fetchall()]
        for item_id in ids:
            item_url = "http://item.taobao.com/item.htm?id=" + str(item_id)
            driver.get(item_url)
            time.sleep(10)  # allow the page's JS to finish rendering
            soup = BeautifulSoup(driver.page_source, "lxml")

            # Colour: either 颜色分类 or 主要颜色 attribute rows may match;
            # keep whichever alternation group captured something.
            color_hits = _COLOR_RE.findall(str(soup.select(".attributes-list")))
            colors = [c for c in color_hits[0] if c] if color_hits else []

            try:
                thumb_html = str(soup.select("#J_UlThumb > li > div > a > img"))
                image_url = "http:" + _IMG_RE.match(thumb_html).group(1)
            except AttributeError:  # no thumbnail matched (match() was None)
                print(item_id)
                image_url = "1"

            category_id = soup.select(".tb-pine")[0].get("data-catid")
            # Keep only the CJK characters of the page title.
            title = "".join(_CJK_RE.findall(driver.title))

            with connection.cursor() as upd:
                # SECURITY/BUG FIX: values are bound parameters now instead of
                # being %-formatted into the SQL text (quotes in a title would
                # have broken the statement).  Only the table name, which comes
                # from our own DB, is concatenated.
                sql = ("UPDATE " + shop["店铺名称"] +
                       " SET 颜色=%s,类目=%s,图片地址=%s,标题=%s,商品地址=%s"
                       " where id = %s")
                print(sql)
                upd.execute(sql, (colors[0] if colors else "",
                                  category_id, image_url, title,
                                  item_url, item_id))
                connection.commit()
connection.commit()