zoukankan      html  css  js  c++  java
  • 白嫖爬虫了,代码可用,换成自己想要爬的网站再改一下网站解析那块代码

    前言:大大小小的电商网站爬了不少。结论就是分两种类型:
    第一:requests 直接获取

    第二:网页动态加载,requests获取失败

    直接分享代码吧

    1.先导入需要的库和chromedriver的地址(爬动态加载的网页需要,若是requests可直接获取的网站可忽略)

    # Standard-library + scraping imports; selenium and chromedriver are only
    # needed for dynamically rendered pages (skip them for requests-only sites).
    import time,re,pandas as pd,os,requests
    from selenium import webdriver
    from bs4 import BeautifulSoup
    
    
    
    # Absolute path to the chromedriver binary used by selenium.
    CHROME_DRIVER_PATH = '/Users/xxxx/PycharmProjects/爬虫/chromedriver'


    2.我先给出主函数,里面方法我会在下面贴出来

    我爬的是电商网站,自然是爬取列表页的商品信息(商品描述,商品链接,商品售价,商品原价)

    那么下面是爬静态网页的核心函数

    #处理静态网页的
    def dealSoup(now_soup,cate_name,cate_url,now_page_num):
    
        #获取有层级的分类
        cate_span_tag_list = now_soup.select('.category-breadcrumb  li ')
        cate_all_text = ''
        for span_tag in cate_span_tag_list:
            cate_all_text += f"{span_tag.text.strip()}"
    
        #获得页数
        total_page_num  = 1
        total_num_tag_list = now_soup.select('.site-pager li')
        if len(total_num_tag_list) == 0:
            pass
        elif len(total_num_tag_list) == 1:
            total_num_tag = total_num_tag_list[1]
            total_num = extractNum(total_num_tag.text)
            print(int(total_num))
            total_page_num = int(total_num)
        else:
            total_num_tag = total_num_tag_list[-2]
            total_num = extractNum(total_num_tag.text)
            print(int(total_num))
            total_page_num = int(total_num)
    
        #遍历全部商品
        tag_list = now_soup.select('.category-list div.item')
        if len(tag_list) > 0:
            print(len(tag_list))
            item_list = []
            for tag in tag_list:
                item = {
                    'cate_name_all' : cate_all_text[:-1],
                    'cate_name' : cate_name,
                    'cate_url' : cate_url,
                    'product_now_price' : 'null',
                    'product_old_price' : 'null'
                }
                desc_tag = tag.select('.name > a')[0]
    
                price_tag_list = tag.select('.my-shop-price')
    
    
                item['product_desc'] = desc_tag.text.strip()
                item['product_link'] = desc_tag.attrs['href']
                if len(price_tag_list) > 0:
                    item['product_now_price'] = price_tag_list[0].attrs['data-oprice']
                    item['product_old_price'] = price_tag_list[0].attrs['data-oprice']
                if len(price_tag_list) > 1:
                    item['product_old_price'] = price_tag_list[1].attrs['data-oprice']
                print(item)
                item_list.append(item)
    
            objListToExcel(item_list,heads_0,f"{save_dir}/{cate_name}_{now_page_num}.xlsx")
            return True,total_page_num
        else:
            return False,total_page_num
    
    
    
    if __name__ == "__main__":
        #需要爬去的列表页链接
        #cate_url:列表页url
        #cate_name:你对这个列表页的分类定义
        ALL_CATE_LIST = [
            {'cate_url': 'https://www.adorawe.net/category/denim-pants-c_808.html',
             'cate_name': 'Pants1'},
            {'cate_url': 'https://www.adorawe.net/category/casual-pants-c_809.html',
             'cate_name': 'Pants'},
        ]
        #设置一个文件加用来存爬取的信息
        save_dir = '/Users/xxxx/Desktop/adorawe'
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        #开始爬列表页
        for cate_obj in ALL_CATE_LIST:
            #获得BeautifulSoup格式的网页文件
            soup = get_static_html(cate_obj['cate_url'])
            #处理网页,保存本页商品数据,获得该列表页的总页数
            go_status,page_num = dealSoup(soup, cate_obj['cate_name'], cate_obj['cate_url'], 1)
            #翻页爬取
            for i in range(1,page_num):
                body_url = cate_obj['cate_url'].replace('.html','')
                tmp_url = f"{body_url}-page-{i+1}.html"
                tmp_soup = get_static_html(tmp_url)
                go_status, page_num = dealSoup(tmp_soup, cate_obj['cate_name'], tmp_url, i+1)
        #因为是每页的商品数据单独保存,,所以需要合并成一个
        connectToOne(save_dir, '/Users/xxx/Desktop', 'adorawe.xlsx')
    

      下面是爬 动态网页的

    #处理动态加载网页的
    def dealSoup(driver,cate_name,cate_url,page_num):
        now_data = driver.page_source
        now_soup = BeautifulSoup(now_data, 'html.parser')
    
        #获取有层级的分类
        cate_span_tag_list = now_soup.select('ul.breadcrumb > li')
        cate_all_text = ''
        for cate_span in cate_span_tag_list:
            cate_all_text += f"{cate_span.text.strip()}/"
    
        #遍历全部商品
        tag_list = now_soup.select('div.product-list-container > .product-item')
        if len(tag_list) > 0:
            print(len(tag_list))
            item_list = []
            for tag in tag_list:
                item = {
                    'cate_name_all' : cate_all_text[:-1],
                    'cate_name' : cate_name,
                    'cate_url' : cate_url,
                    'product_now_price' : 'null',
                    'product_old_price' : 'null'
                }
                desc_tag = tag.select('.product-item-name')[0]
                link_tag = desc_tag.select('a')[-1]
                final_price_tag_list = tag.select('.product-item-final-price-js')
                del_price_tag_list = tag.select('.product-item-del-price-js')
    
                item['product_desc'] = desc_tag.text.strip()
                item['product_link'] = link_tag.attrs['href']
                if len(final_price_tag_list) > 0:
                    item['product_now_price'] = final_price_tag_list[0].text.strip()
                    item['product_old_price'] = final_price_tag_list[0].text.strip()
                if len(del_price_tag_list) > 0:
                    item['product_old_price'] = del_price_tag_list[0].text.strip()
                print(item)
                item_list.append(item)
    
            objListToExcel(item_list,heads_0,f"{save_dir}/{cate_name}_{page_num}.xlsx")
            return True
        else:
            return False
    
    
    
    if __name__ == "__main__":
        # 需要爬去的列表页链接
        # cate_url:列表页url
        # cate_name:你对这个列表页的分类定义
        # total_page:这个列表页的总页数
        ALL_CATE_LIST = [
            {'cate_url': 'https://sea.newchic.com/pajamas-and-robes-c-4185/?country=188&SEA=0',
             'cate_name': 'Loungewear',
             'total_page': 9},
            {'cate_url': 'https://sea.newchic.com/womens-shoes-c-3592/?country=188&SEA=0',
             'cate_name': 'Shoes',
             'total_page': 62 },
        ]
        #设置一个文件加用来存爬取的信息
        save_dir = '/Users/xxx/Desktop/newchic'
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        #模拟浏览器打开网页
        site_url_0 = ALL_CATE_LIST[0]['cate_url']
        print('开始加载', site_url_0, '动态页面')
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--ignore-ssl-errors')
        driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, chrome_options=chrome_options)
        driver.set_page_load_timeout(100)
        driver.set_window_size(1420, 780)
        driver.get(site_url_0)
        #由于是懒加载,需要模拟滚动屏幕,是页面加载全部的商品
        #第二个参数是滚动距离,根据爬取的页面调整大小,使得商品全部加载即可
        fullpage_screenshot(driver,10000)
        time.sleep(5)
        #处理该页面,并存储到本地
        dealSoup(driver, ALL_CATE_LIST[0]['cate_name'], ALL_CATE_LIST[0]['cate_url'], 0)
        #开始爬列表页
        for cate_obj in ALL_CATE_LIST:
            driver.get(cate_obj['cate_url'])
            fullpage_screenshot(driver, 10000)
            dealSoup(driver, cate_obj['cate_name'], cate_obj['cate_url'], 0)
            #判断是否可以翻页
            go_status = True
            for i in range(1,cate_obj['total_page']):
                if go_status:
                    next_page_tag_list = driver.find_elements_by_css_selector('.page-item-next')
                    if len(next_page_tag_list) > 0:
                        next_page_tag_list[0].click()
                        time.sleep(3)
                        fullpage_screenshot(driver, 6000)
                        go_status = dealSoup(driver,cate_obj['cate_name'],cate_obj['cate_url'],i)
                    else:
                        go_status = False
        time.sleep(10)
        driver.quit()
        # 因为是每页的商品数据单独保存,,所以需要合并成一个
        connectToOne(save_dir, '/Users/xxx/Desktop', 'newchic.xlsx')
    

      

    最后:

    用到的其他方法,我就一次性粘贴了:

    # 模拟滚动
    def fullpage_screenshot(driver, total_height):
        total_width = driver.execute_script("return document.body.offsetWidth")
        # total_height = driver.execute_script("return document.body.parentNode.scrollHeight")
        # total_height = 50000
        viewport_width = driver.execute_script("return document.body.clientWidth")
        viewport_height = driver.execute_script("return window.innerHeight")
        rectangles = []
    
        i = 0
        while i < total_height:
            ii = 0
            top_height = i + viewport_height
    
            if top_height > total_height:
                top_height = total_height
    
            while ii < total_
                top_width = ii + viewport_width
    
                if top_width > total_
                    top_width = total_width
                rectangles.append((ii, i, top_width, top_height))
    
                ii = ii + viewport_width
    
            i = i + viewport_height
    
        previous = None
        part = 0
    
        for rectangle in rectangles:
            if not previous is None:
                driver.execute_script("window.scrollTo({0}, {1})".format(rectangle[0], rectangle[1]))
                time.sleep(0.2)
    
            file_name = "part_{0}.png".format(part)
    
            # driver.get_screenshot_as_file(file_name)
            if rectangle[1] + viewport_height > total_height:
                offset = (rectangle[0], total_height - viewport_height)
            else:
                offset = (rectangle[0], rectangle[1])
            part = part + 1
            previous = rectangle
        return True
    
    
    heads_0 = ['cate_name_all','cate_name', 'cate_url', 'product_link', 'product_desc','product_now_price','product_old_price']
    
    def objListToExcel(objlist,column_arr,out_path):
        df_data_source = {}
        for filed in column_arr:
            df_data_source[filed] = []
        if len(objlist) == 0:
            return 0
        for obj in objlist:
            for key_0 in column_arr:
                df_data_source[key_0].append(obj[key_0])
        df_data = pd.DataFrame(df_data_source,columns=column_arr)
        df_data.to_excel(out_path,index=False)
    
    
    
    def extractPriceNum(price_str):
        # 价格正则
        price_pattern = re.compile(r'[0-9]+.[0-9]{2}')
        price_num_arr = re.findall(price_pattern,price_str)
        if len(price_num_arr) > 0:
            return price_num_arr[0]
        else:
            return 'null'
    def extractNum(test_str):
        # 价格正则
        price_pattern = re.compile(r'[0-9]+')
        num_arr = re.findall(price_pattern,test_str)
        if len(num_arr) > 0:
            return int(num_arr[0])
        else:
            return 1
    
    
    
    def connectToOne(dir, to_dir, out_file_name):
        """Merge every .xlsx file in *dir* into one workbook at *to_dir*/*out_file_name*."""
        excel_list = []
        for file in os.listdir(dir):
            # Skip non-xlsx files and Office lock/temp files ('.~...').
            if file.endswith('.xlsx') and '.~' not in file :
                print("file:", file)
                excel_list.append(
                    pd.read_excel(os.path.join(dir, file), dtype={'cate_url': str, 'product_link': str}, ))
        print('开始合并')
        total_excel = pd.concat(excel_list)
        print('生成文件')
        # strings_to_urls=False keeps long product URLs as plain text so
        # xlsxwriter doesn't reject/convert them.
        # NOTE(review): the `options=` keyword is the old ExcelWriter API;
        # recent pandas versions expect engine_kwargs={'options': ...} —
        # confirm against the installed pandas version.
        writer = pd.ExcelWriter(os.path.join(to_dir, out_file_name), engine='xlsxwriter',
                                options={'strings_to_urls': False})
        print(os.path.join(to_dir, out_file_name), writer)
        total_excel.to_excel(writer, index=False)
        writer.close()
    

      

    ————————————————
    版权声明:本文为CSDN博主「blues_phone」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
    原文链接:https://blog.csdn.net/huangmengfeng/article/details/116146346

  • 相关阅读:
    [JSOI2010]解题报告+2010~2011小结
    有用的东西(emacs配置和bzoj数据下载网址)
    [JSOI2011]解题报告
    [JSOI2010]旅行题解
    [BOI2007]Mokia题解
    分块总结
    统计数字
    爬不出去的水井
    采药
    沙漠储油点
  • 原文地址:https://www.cnblogs.com/lelexiu/p/14734345.html
Copyright © 2011-2022 走看看