zoukankan      html  css  js  c++  java
  • 白嫖爬虫了,代码可用,换成自己想要爬的网站再改一下网站解析那块代码

    前言:大大小小的电商网站爬了不少。结论就是分两种类型:
    第一:requests 直接获取

    第二:网页动态加载,requests获取失败

    直接分享代码吧

    1.先导入需要的库和chromedriver的地址(爬动态加载的网页需要,若是requests可直接获取的网站可忽略)

    # Standard-library + scraping imports; selenium and chromedriver are only
    # needed for dynamically rendered pages (skip them for requests-only sites).
    import time,re,pandas as pd,os,requests
    from selenium import webdriver
    from bs4 import BeautifulSoup
    
    
    
    # Absolute path to the chromedriver binary used by selenium.
    CHROME_DRIVER_PATH = '/Users/xxxx/PycharmProjects/爬虫/chromedriver'


    2.我先给出主函数,里面方法我会在下面贴出来

    我爬的是电商网站,自然是爬取列表页的商品信息(商品描述,商品链接,商品售价,商品原价)

    那么下面是爬静态网页的核心函数

    #处理静态网页的
    def dealSoup(now_soup,cate_name,cate_url,now_page_num):
    
        #获取有层级的分类
        cate_span_tag_list = now_soup.select('.category-breadcrumb  li ')
        cate_all_text = ''
        for span_tag in cate_span_tag_list:
            cate_all_text += f"{span_tag.text.strip()}"
    
        #获得页数
        total_page_num  = 1
        total_num_tag_list = now_soup.select('.site-pager li')
        if len(total_num_tag_list) == 0:
            pass
        elif len(total_num_tag_list) == 1:
            total_num_tag = total_num_tag_list[1]
            total_num = extractNum(total_num_tag.text)
            print(int(total_num))
            total_page_num = int(total_num)
        else:
            total_num_tag = total_num_tag_list[-2]
            total_num = extractNum(total_num_tag.text)
            print(int(total_num))
            total_page_num = int(total_num)
    
        #遍历全部商品
        tag_list = now_soup.select('.category-list div.item')
        if len(tag_list) > 0:
            print(len(tag_list))
            item_list = []
            for tag in tag_list:
                item = {
                    'cate_name_all' : cate_all_text[:-1],
                    'cate_name' : cate_name,
                    'cate_url' : cate_url,
                    'product_now_price' : 'null',
                    'product_old_price' : 'null'
                }
                desc_tag = tag.select('.name > a')[0]
    
                price_tag_list = tag.select('.my-shop-price')
    
    
                item['product_desc'] = desc_tag.text.strip()
                item['product_link'] = desc_tag.attrs['href']
                if len(price_tag_list) > 0:
                    item['product_now_price'] = price_tag_list[0].attrs['data-oprice']
                    item['product_old_price'] = price_tag_list[0].attrs['data-oprice']
                if len(price_tag_list) > 1:
                    item['product_old_price'] = price_tag_list[1].attrs['data-oprice']
                print(item)
                item_list.append(item)
    
            objListToExcel(item_list,heads_0,f"{save_dir}/{cate_name}_{now_page_num}.xlsx")
            return True,total_page_num
        else:
            return False,total_page_num
    
    
    
    if __name__ == "__main__":
        #需要爬去的列表页链接
        #cate_url:列表页url
        #cate_name:你对这个列表页的分类定义
        ALL_CATE_LIST = [
            {'cate_url': 'https://www.adorawe.net/category/denim-pants-c_808.html',
             'cate_name': 'Pants1'},
            {'cate_url': 'https://www.adorawe.net/category/casual-pants-c_809.html',
             'cate_name': 'Pants'},
        ]
        #设置一个文件加用来存爬取的信息
        save_dir = '/Users/xxxx/Desktop/adorawe'
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        #开始爬列表页
        for cate_obj in ALL_CATE_LIST:
            #获得BeautifulSoup格式的网页文件
            soup = get_static_html(cate_obj['cate_url'])
            #处理网页,保存本页商品数据,获得该列表页的总页数
            go_status,page_num = dealSoup(soup, cate_obj['cate_name'], cate_obj['cate_url'], 1)
            #翻页爬取
            for i in range(1,page_num):
                body_url = cate_obj['cate_url'].replace('.html','')
                tmp_url = f"{body_url}-page-{i+1}.html"
                tmp_soup = get_static_html(tmp_url)
                go_status, page_num = dealSoup(tmp_soup, cate_obj['cate_name'], tmp_url, i+1)
        #因为是每页的商品数据单独保存,,所以需要合并成一个
        connectToOne(save_dir, '/Users/xxx/Desktop', 'adorawe.xlsx')
    

      下面是爬 动态网页的

    #处理动态加载网页的
    def dealSoup(driver,cate_name,cate_url,page_num):
        now_data = driver.page_source
        now_soup = BeautifulSoup(now_data, 'html.parser')
    
        #获取有层级的分类
        cate_span_tag_list = now_soup.select('ul.breadcrumb > li')
        cate_all_text = ''
        for cate_span in cate_span_tag_list:
            cate_all_text += f"{cate_span.text.strip()}/"
    
        #遍历全部商品
        tag_list = now_soup.select('div.product-list-container > .product-item')
        if len(tag_list) > 0:
            print(len(tag_list))
            item_list = []
            for tag in tag_list:
                item = {
                    'cate_name_all' : cate_all_text[:-1],
                    'cate_name' : cate_name,
                    'cate_url' : cate_url,
                    'product_now_price' : 'null',
                    'product_old_price' : 'null'
                }
                desc_tag = tag.select('.product-item-name')[0]
                link_tag = desc_tag.select('a')[-1]
                final_price_tag_list = tag.select('.product-item-final-price-js')
                del_price_tag_list = tag.select('.product-item-del-price-js')
    
                item['product_desc'] = desc_tag.text.strip()
                item['product_link'] = link_tag.attrs['href']
                if len(final_price_tag_list) > 0:
                    item['product_now_price'] = final_price_tag_list[0].text.strip()
                    item['product_old_price'] = final_price_tag_list[0].text.strip()
                if len(del_price_tag_list) > 0:
                    item['product_old_price'] = del_price_tag_list[0].text.strip()
                print(item)
                item_list.append(item)
    
            objListToExcel(item_list,heads_0,f"{save_dir}/{cate_name}_{page_num}.xlsx")
            return True
        else:
            return False
    
    
    
    if __name__ == "__main__":
        # 需要爬去的列表页链接
        # cate_url:列表页url
        # cate_name:你对这个列表页的分类定义
        # total_page:这个列表页的总页数
        ALL_CATE_LIST = [
            {'cate_url': 'https://sea.newchic.com/pajamas-and-robes-c-4185/?country=188&SEA=0',
             'cate_name': 'Loungewear',
             'total_page': 9},
            {'cate_url': 'https://sea.newchic.com/womens-shoes-c-3592/?country=188&SEA=0',
             'cate_name': 'Shoes',
             'total_page': 62 },
        ]
        #设置一个文件加用来存爬取的信息
        save_dir = '/Users/xxx/Desktop/newchic'
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        #模拟浏览器打开网页
        site_url_0 = ALL_CATE_LIST[0]['cate_url']
        print('开始加载', site_url_0, '动态页面')
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--ignore-ssl-errors')
        driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, chrome_options=chrome_options)
        driver.set_page_load_timeout(100)
        driver.set_window_size(1420, 780)
        driver.get(site_url_0)
        #由于是懒加载,需要模拟滚动屏幕,是页面加载全部的商品
        #第二个参数是滚动距离,根据爬取的页面调整大小,使得商品全部加载即可
        fullpage_screenshot(driver,10000)
        time.sleep(5)
        #处理该页面,并存储到本地
        dealSoup(driver, ALL_CATE_LIST[0]['cate_name'], ALL_CATE_LIST[0]['cate_url'], 0)
        #开始爬列表页
        for cate_obj in ALL_CATE_LIST:
            driver.get(cate_obj['cate_url'])
            fullpage_screenshot(driver, 10000)
            dealSoup(driver, cate_obj['cate_name'], cate_obj['cate_url'], 0)
            #判断是否可以翻页
            go_status = True
            for i in range(1,cate_obj['total_page']):
                if go_status:
                    next_page_tag_list = driver.find_elements_by_css_selector('.page-item-next')
                    if len(next_page_tag_list) > 0:
                        next_page_tag_list[0].click()
                        time.sleep(3)
                        fullpage_screenshot(driver, 6000)
                        go_status = dealSoup(driver,cate_obj['cate_name'],cate_obj['cate_url'],i)
                    else:
                        go_status = False
        time.sleep(10)
        driver.quit()
        # 因为是每页的商品数据单独保存,,所以需要合并成一个
        connectToOne(save_dir, '/Users/xxx/Desktop', 'newchic.xlsx')
    

      

    最后:

    用到的其他方法,我就一次性粘贴了:

    # 模拟滚动
    def fullpage_screenshot(driver, total_height):
        total_width = driver.execute_script("return document.body.offsetWidth")
        # total_height = driver.execute_script("return document.body.parentNode.scrollHeight")
        # total_height = 50000
        viewport_width = driver.execute_script("return document.body.clientWidth")
        viewport_height = driver.execute_script("return window.innerHeight")
        rectangles = []
    
        i = 0
        while i < total_height:
            ii = 0
            top_height = i + viewport_height
    
            if top_height > total_height:
                top_height = total_height
    
            while ii < total_
                top_width = ii + viewport_width
    
                if top_width > total_
                    top_width = total_width
                rectangles.append((ii, i, top_width, top_height))
    
                ii = ii + viewport_width
    
            i = i + viewport_height
    
        previous = None
        part = 0
    
        for rectangle in rectangles:
            if not previous is None:
                driver.execute_script("window.scrollTo({0}, {1})".format(rectangle[0], rectangle[1]))
                time.sleep(0.2)
    
            file_name = "part_{0}.png".format(part)
    
            # driver.get_screenshot_as_file(file_name)
            if rectangle[1] + viewport_height > total_height:
                offset = (rectangle[0], total_height - viewport_height)
            else:
                offset = (rectangle[0], rectangle[1])
            part = part + 1
            previous = rectangle
        return True
    
    
    heads_0 = ['cate_name_all','cate_name', 'cate_url', 'product_link', 'product_desc','product_now_price','product_old_price']
    
    def objListToExcel(objlist,column_arr,out_path):
        df_data_source = {}
        for filed in column_arr:
            df_data_source[filed] = []
        if len(objlist) == 0:
            return 0
        for obj in objlist:
            for key_0 in column_arr:
                df_data_source[key_0].append(obj[key_0])
        df_data = pd.DataFrame(df_data_source,columns=column_arr)
        df_data.to_excel(out_path,index=False)
    
    
    
    def extractPriceNum(price_str):
        # 价格正则
        price_pattern = re.compile(r'[0-9]+.[0-9]{2}')
        price_num_arr = re.findall(price_pattern,price_str)
        if len(price_num_arr) > 0:
            return price_num_arr[0]
        else:
            return 'null'
    def extractNum(test_str):
        # 价格正则
        price_pattern = re.compile(r'[0-9]+')
        num_arr = re.findall(price_pattern,test_str)
        if len(num_arr) > 0:
            return int(num_arr[0])
        else:
            return 1
    
    
    
    def connectToOne(dir, to_dir, out_file_name):
        """Merge every .xlsx file in *dir* into one workbook at *to_dir*/*out_file_name*."""
        excel_list = []
        for file in os.listdir(dir):
            # Skip non-xlsx files and Office lock/temp files ('.~...').
            if file.endswith('.xlsx') and '.~' not in file :
                print("file:", file)
                excel_list.append(
                    pd.read_excel(os.path.join(dir, file), dtype={'cate_url': str, 'product_link': str}, ))
        print('开始合并')
        total_excel = pd.concat(excel_list)
        print('生成文件')
        # strings_to_urls=False keeps long product URLs as plain text so
        # xlsxwriter doesn't reject/convert them.
        # NOTE(review): the `options=` keyword is the old ExcelWriter API;
        # recent pandas versions expect engine_kwargs={'options': ...} —
        # confirm against the installed pandas version.
        writer = pd.ExcelWriter(os.path.join(to_dir, out_file_name), engine='xlsxwriter',
                                options={'strings_to_urls': False})
        print(os.path.join(to_dir, out_file_name), writer)
        total_excel.to_excel(writer, index=False)
        writer.close()
    

      

    ————————————————
    版权声明:本文为CSDN博主「blues_phone」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
    原文链接:https://blog.csdn.net/huangmengfeng/article/details/116146346

  • 相关阅读:
    [JSOI2010]解题报告+2010~2011小结
    有用的东西(emacs配置和bzoj数据下载网址)
    [JSOI2011]解题报告
    [JSOI2010]旅行题解
    [BOI2007]Mokia题解
    分块总结
    统计数字
    爬不出去的水井
    采药
    沙漠储油点
  • 原文地址:https://www.cnblogs.com/lelexiu/p/14734345.html
Copyright © 2011-2022 走看看