  • Scraping product listings from Taobao's 1688, suitable for cases where the crawl frequency is low

    1. Prerequisites

    Based on Python 3.6. Dependencies: selenium, xlwt and pandas (the code below also imports BeautifulSoup from bs4 and writes the merged file through pandas' xlsxwriter engine).

    You need to download the chromedriver that matches your Chrome browser version.

    To check your Chrome version:

    click Help > About Google Chrome.

    Then download the matching chromedriver from https://chromedriver.chromium.org/downloads
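
    Before running the crawler, it can be worth a quick smoke test that selenium can actually drive Chrome through the downloaded driver. This is a minimal sketch that assumes the same selenium 3 style API used in the script below; the driver path is a placeholder you should replace with your own. If the driver and browser versions do not match, creating the driver usually fails with a "session not created" error naming the supported Chrome version.

    from selenium import webdriver

    # placeholder path; point this at the chromedriver you just downloaded
    CHROME_DRIVER_PATH = '/path/to/chromedriver'

    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH)
    driver.get('https://www.1688.com')
    print(driver.title)   # should print the 1688 homepage title without raising
    driver.quit()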

    2. Code

    The script works through the following steps:

    #Set the base output directory
    #1. Initialize the folders
    #2. Set the links to crawl
    #3. Crawl each link
    #4. Parse the 1688 listing page
    #5. Persist the results to a spreadsheet
    #6. Mark the link as crawled
    #7. Merge everything into one spreadsheet

    The code is below. When you use it, remember to move all the function definitions above the main block; I put the main block first only to make it easier to read.
    import os, xlwt, time
    import pandas as pd
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from datetime import datetime


    # Path to chromedriver; remember to change this to your own
    CHROME_DRIVER_PATH = '/Users/huangmengfeng/PycharmProjects/2020_study_plan/chromedriver'


    if __name__ == '__main__':
        # Base output directory; change it to your own. You only need to set this base directory,
        # the sub-directories are created automatically and the merged output ends up in OUT.
        base_dir_0 = '/Users/huangmengfeng/Desktop/1688_SCRIPY'
        site_name_0 = 's.1688.com'

        # 1. Initialize the folders
        tmp_data_dir_0, out_dir_0, record_path_0 = createDir(base_dir_0, site_name_0)
        done_url = getDoneUrl(record_path_0)
        # 2. Set the links to crawl
        need_url_list = [{'page_url': 'https://s.1688.com/selloffer/offer_search.htm?keywords=%C5%B7%C3%C0%C5%AE%D7%B0&n=y&netType=1%2C11%2C16',
                          'cate_name': '欧美女装', 'total_num': 50}]
        # 3. Crawl each link
        for need_url in need_url_list:
            for i in range(1, 2):
                now_page_url = "{0}&beginPage={1}#sm-filtbar".format(need_url['page_url'], i)
                # Skip links that have already been crawled
                if now_page_url not in done_url:
                    # 4. Parse the 1688 listing page; scrolling is simulated so the lazy-loaded items all render
                    soup_0 = get_dynamic_html2(now_page_url)
                    now_page_info_list = analysis1688Page(soup_0)
                    # 5. Persist the results to a spreadsheet
                    heads_0 = ['shop_name', 'shop_url', 'desc', 'url', 'price']
                    exportToExcel(heads_0, now_page_info_list, tmp_data_dir_0, '{0}_{1}.xlsx'.format(need_url['cate_name'], i))
                    # 6. Mark the link as crawled
                    addDoneUrl(record_path_0, now_page_url)
        # 7. Merge everything into one spreadsheet
        connectToOne(tmp_data_dir_0, out_dir_0, "{0}_{1}.xlsx".format(site_name_0, datetime.strftime(datetime.now(), '%Y%m%d%H%M%S')))


    # Initialize the folder structure
    def createDir(base_dir, site_name):
        tmp_data_dir_1 = "{0}/{1}".format(base_dir, site_name)
        tmp_data_dir_2 = "{0}/{1}/{2}".format(base_dir, site_name, datetime.strftime(datetime.now(), '%Y%m%d'))
        out_dir = '{0}/OUT'.format(base_dir)
        path_arr = [base_dir, tmp_data_dir_1, tmp_data_dir_2, out_dir]
        for path in path_arr:
            if not os.path.exists(path):
                os.mkdir(path)
        record_path = '{0}/record.txt'.format(tmp_data_dir_2)
        if not os.path.exists(record_path):
            with open(record_path, 'a+', encoding="utf-8") as f:
                f.write('')
        return tmp_data_dir_2, out_dir, record_path


    # Load a dynamic page with Selenium and return the parsed HTML
    def get_dynamic_html2(site_url):
        print('Loading dynamic page', site_url)
        chrome_options = webdriver.ChromeOptions()
        # disable the sandbox
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        # uncomment to run headless
        # chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--ignore-ssl-errors')
        driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, chrome_options=chrome_options)
        # print('dynamic load web is', site_url)
        driver.set_page_load_timeout(100)
        driver.set_window_size(1920, 1080)
        # driver.set_script_timeout(100)
        try:
            driver.get(site_url)
        except Exception as e:
            driver.execute_script('window.stop()')  # stop loading once the timeout is exceeded
            print(e, 'dynamic web load timeout')

        time.sleep(2)

        fullpage_screenshot(driver, 8000)

        data2 = driver.page_source
        soup2 = BeautifulSoup(data2, 'html.parser')

        try:
            time.sleep(3)
            driver.quit()
        except:
            pass
        return soup2


    # Simulate scrolling through the whole page (adapted from a full-page screenshot helper)
    def fullpage_screenshot(driver, total_height):
        total_width = driver.execute_script("return document.body.offsetWidth")
        # total_height = driver.execute_script("return document.body.parentNode.scrollHeight")
        # total_height = 50000
        viewport_width = driver.execute_script("return document.body.clientWidth")
        viewport_height = driver.execute_script("return window.innerHeight")
        rectangles = []

        i = 0
        while i < total_height:
            ii = 0
            top_height = i + viewport_height

            if top_height > total_height:
                top_height = total_height

            while ii < total_width:
                top_width = ii + viewport_width

                if top_width > total_width:
                    top_width = total_width
                rectangles.append((ii, i, top_width, top_height))

                ii = ii + viewport_width

            i = i + viewport_height

        previous = None
        part = 0

        for rectangle in rectangles:
            if previous is not None:
                driver.execute_script("window.scrollTo({0}, {1})".format(rectangle[0], rectangle[1]))
                print("Scrolled To ({0},{1})".format(rectangle[0], rectangle[1]))
                time.sleep(0.5)

            file_name = "part_{0}.png".format(part)
            print("Capturing {0} ...".format(file_name))

            # driver.get_screenshot_as_file(file_name)

            if rectangle[1] + viewport_height > total_height:
                offset = (rectangle[0], total_height - viewport_height)
            else:
                offset = (rectangle[0], rectangle[1])

            print("Adding to stitched image with offset ({0}, {1})".format(offset[0], offset[1]))
            part = part + 1
            previous = rectangle
        print("Finishing chrome full page screenshot workaround...")
        return True


    # Parse a 1688 listing page into a list of product dicts
    def analysis1688Page(soup):
        info_tag_list = soup.select('.sm-offer-item')
        now_page_info_list = []
        for info_tag in info_tag_list:
            skc = info_tag.attrs['trace-obj_value']
            a_tag_list = info_tag.select('a')
            price_tag_list = info_tag.select('span.sm-offer-priceNum')
            # print(info_tag)
            img_tag = a_tag_list[0].select('img')[0]

            shop_tag = a_tag_list[2]
            desc = img_tag.attrs['alt']
            url = 'https://detail.1688.com/offer/{0}.html'.format(skc)
            shop_name = shop_tag.text
            shop_url = shop_tag.attrs['href']

            price = ''
            if len(price_tag_list) > 0:
                price_tag = price_tag_list[0]
                price = price_tag.attrs['title']
            print({'shop_name': shop_name, 'shop_url': shop_url, 'desc': desc, 'url': url, 'price': price})
            now_page_info_list.append(
                {'shop_name': shop_name, 'shop_url': shop_url, 'desc': desc, 'url': url, 'price': price})
        return now_page_info_list


    # Persist the scraped data to a spreadsheet
    def exportToExcel(heads, task_done, path, filename):
        if not os.path.exists(path):
            os.makedirs(path)
        task_xls = xlwt.Workbook(encoding='utf-8')
        task_sheet1 = task_xls.add_sheet('sheet1')
        # Header row
        header_align = xlwt.Alignment()
        header_align.horz = xlwt.Alignment.HORZ_CENTER
        header_style = xlwt.XFStyle()
        header_style.alignment = header_align
        for i in range(len(heads)):
            task_sheet1.col(i).width = 12000
            task_sheet1.write(0, i, heads[i], header_style)
        # Data rows
        for i in range(len(task_done)):
            for j in range(len(heads)):
                task_sheet1.write(i + 1, j, task_done[i][heads[j]])
        print(os.path.join(path, filename))
        task_xls.save(os.path.join(path, filename))
        return filename


    # Get the URLs that have already been crawled
    def getDoneUrl(path):
        done_url = []
        with open(path, 'r', encoding="utf-8") as f:
            url_list = f.readlines()
            for url in url_list:
                done_url.append(url.rstrip('\n'))
            print(done_url)
        return done_url


    # Record a crawled URL
    def addDoneUrl(path, content):
        try:
            with open(path, 'a+', encoding="utf-8") as f:
                f.write(content + '\n')
        except Exception as e:
            print(e)


    # Merge all per-page spreadsheets into one file
    def connectToOne(dir, to_dir, out_file_name):
        excel_df = pd.DataFrame()
        for file in os.listdir(dir):
            if file.endswith('.xlsx'):
                print("file:", file)
                excel_df = excel_df.append(
                    pd.read_excel(os.path.join(dir, file), dtype={'url': str}))
        print('Merging...')
        excel_df['currency'] = '$'
        writer = pd.ExcelWriter(os.path.join(to_dir, out_file_name), engine='xlsxwriter',
                                options={'strings_to_urls': False})

        excel_df.to_excel(writer, index=False)
        writer.close()
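
    One note on the search URL in need_url_list: the keywords parameter is percent-encoded GBK (not UTF-8), which is why it appears as %C5%B7%C3%C0%C5%AE%D7%B0 for "欧美女装". Below is a small sketch of building the same parameter for another search term with only the standard library, assuming 1688 keeps accepting GBK-encoded keywords:

    from urllib.parse import quote

    keyword = '欧美女装'
    encoded = quote(keyword, encoding='gbk')   # -> '%C5%B7%C3%C0%C5%AE%D7%B0'
    page_url = ('https://s.1688.com/selloffer/offer_search.htm'
                '?keywords={0}&n=y&netType=1%2C11%2C16'.format(encoded))
    print(page_url)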

     

  • Original post: https://www.cnblogs.com/lelexiu/p/12896759.html