zoukankan      html  css  js  c++  java
  • 爬虫技术:爬取淘宝美食数据:崔庆才思路

    # TODO: Selenium automation is already being detected by the site

    import random
    import re
    import time

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    
    # Single Chrome WebDriver instance shared by search() and next_page().
    # NOTE: this runs at import time, so importing the module starts a browser.
    driver = webdriver.Chrome()
    
    def search():
        """Open Taobao, search for "美食" and return the pager's total-pages text.

        Per the original author's note, submitting the search redirects to a
        login page that must be completed manually in the browser window, so
        the explicit waits below can easily time out. A timed-out attempt is
        retried from the start (reloading the home page) until it succeeds.

        Returns:
            str: The text of the pager's ``div.total`` element; it is expected
            to look like "共 100 页," (the page count is parsed by the caller).
        """
        while True:
            try:
                driver.get("https://www.taobao.com/")
                # Search input box.
                search_box = WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
                )
                # Search button. TODO: why not use an id selector here?
                submit = WebDriverWait(driver, 15).until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")
                    )
                )
                search_box.send_keys("美食")
                time.sleep(1)
                submit.click()
                # Element that shows the total number of result pages.
                total = WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")
                    )
                )
                return total.text
            except TimeoutException:
                # WebDriverWait.until raises selenium's TimeoutException, not
                # the builtin TimeoutError, so that is what must be caught.
                # Retrying in a loop (instead of recursing) keeps the call
                # stack flat however many attempts are needed.
                continue
    
    def next_page(page):
        """Jump to result page ``page`` using the pager's "go to page" form.

        Types the page number into the pager input, waits a random 1-6 seconds,
        clicks the confirm button, then waits until the highlighted pager item
        shows the requested number. If any wait times out, the whole sequence
        is retried until it succeeds.

        Args:
            page: Page number to jump to (an int in the caller's loop).

        Returns:
            None.
        """
        while True:
            try:
                # "Go to page" number input.
                page_box = WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")
                    )
                )
                # Confirm button.
                submit = WebDriverWait(driver, 15).until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")
                    )
                )
                page_box.clear()
                page_box.send_keys(str(page))
                # Author's notes: with a fixed 1 s delay a slider captcha
                # appeared around page 34 (automation was detected), so the
                # delay must not be constant. With a random delay, manual
                # verification is required on the very first attempt.
                # TODO: how does Taobao detect selenium?
                second = random.randint(1, 6)
                time.sleep(second)
                submit.click()
                # Success condition: the highlighted pager item shows the
                # requested page number.
                WebDriverWait(driver, 15).until(
                    EC.text_to_be_present_in_element(
                        (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                        str(page),
                    )
                )
                return None
            except TimeoutException:
                # WebDriverWait.until raises selenium's TimeoutException, not
                # the builtin TimeoutError. Retry in a loop rather than
                # recursing so repeated timeouts cannot exhaust the stack.
                continue
        
    
    
    def parse_total_pages(total_text):
        """Extract the page count from the pager's total text.

        Args:
            total_text: Pager text containing the number of pages,
                e.g. "共 100 页,".

        Returns:
            int: The first run of digits found in ``total_text``.

        Raises:
            ValueError: If ``total_text`` contains no digits.
        """
        # Raw string with an escaped \d: the pattern "(d+)" would match the
        # literal letter "d" and never find the number.
        match = re.search(r"(\d+)", total_text)
        if match is None:
            raise ValueError(f"no page count found in pager text: {total_text!r}")
        return int(match.group(1))


    if __name__ == '__main__':
        total = search()  # e.g. "共 100 页,"
        total_page = parse_total_pages(total)
        # search() leaves the browser on the first results page, so paging
        # starts at page 2.
        for i in range(2, total_page + 1):
            next_page(i)
  • 相关阅读:
    机器分配
    搭建免费私有音乐云
    nginx相关
    idea常用插件
    notepad++ 实用插件
    linux 新建自启服务
    scala 语法特性小计
    spring boot 静态资源 访问 配置
    SVN-Unable to create pristine install stream
    idea 编译 错误 Error:java: Compilation failed: internal java compiler error 解决方案
  • 原文地址:https://www.cnblogs.com/meloncodezhang/p/11564049.html
Copyright © 2011-2022 走看看