zoukankan      html  css  js  c++  java
  • 用selenium爬取淘宝美食

    '''利用selenium爬取淘宝美食网页内容'''
    
    import re
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from pyquery import PyQuery as pq
    from config import *
    
    # PhantomJS browser session, started with the switches listed in SERVICE_ARGS.
    # (webdriver.Chrome() can be substituted here to drive a real browser instead.)
    driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)

    # Original author's note: with this window size set, the page content can be
    # scraped; without it the waits end in a TimeOut error.
    driver.set_window_size(1400, 900)

    # Shared explicit wait used by every function below: up to 10 seconds per condition.
    wait = WebDriverWait(driver, 10)
    
    def search():
        """Open the Taobao home page, submit KEYWORD and scrape the first result page.

        Returns:
            The text of the pager's "total" element on the result page.

        A TimeoutException from any wait (including the one inside
        get_products) prints 'TimeOut' and restarts the whole search by
        calling this function again; there is no retry limit.
        """
        print('正在搜索')
        query_box_locator = (By.CSS_SELECTOR, '#q')
        search_button_locator = (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')
        total_label_locator = (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')
        try:
            driver.get('http://www.taobao.com')
            query_box = wait.until(EC.presence_of_element_located(query_box_locator))
            search_button = wait.until(EC.element_to_be_clickable(search_button_locator))
            query_box.send_keys(KEYWORD)
            search_button.click()
            # The pager only appears once the result page has rendered.
            total_label = wait.until(EC.presence_of_element_located(total_label_locator))
            get_products()
            return total_label.text
        except TimeoutException:
            print('TimeOut')
            return search()
    
    def next_page(page_number):
        """Jump to result page `page_number` via the pager form and scrape it.

        Args:
            page_number: Page index typed into the pager's input box.

        A TimeoutException from any wait prints 'TimeOut' and retries the same
        page by calling this function again; there is no retry limit.
        """
        print('正在翻页', page_number)
        page_input_locator = (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')
        confirm_locator = (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
        active_page_locator = (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span')
        try:
            page_input = wait.until(EC.presence_of_element_located(page_input_locator))
            confirm_button = wait.until(EC.element_to_be_clickable(confirm_locator))
            page_input.clear()
            page_input.send_keys(page_number)
            confirm_button.click()
            # Wait until the highlighted pager entry shows the requested page
            # number before reading the product list.
            wait.until(EC.text_to_be_present_in_element(active_page_locator, str(page_number)))
            get_products()
        except TimeoutException:
            print('TimeOut')
            next_page(page_number)
    
    def get_products():
        """Print one dict per product found on the currently loaded result page.

        Waits for at least one product element to be present, parses the
        page source with PyQuery, and prints a dict with the keys image,
        price, deal, title, shop and location for every matched item.
        A TimeoutException from the wait propagates to the caller.
        """
        item_selector = '#mainsrp-itemlist .items .item'
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
        page = pq(driver.page_source)
        for card in page(item_selector).items():
            print({
                'image': card.find('.pic .img').attr('src'),
                'price': card.find('.price').text(),
                # Drops the last three characters of the deal-count text
                # (presumably a trailing unit suffix; confirm against the page).
                'deal': card.find('.deal-cnt').text()[:-3],
                'title': card.find('.title').text(),
                'shop': card.find('.shop').text(),
                'location': card.find('.location').text(),
            })
    
    
    def main():
        """Run the crawl: search, then visit every remaining result page.

        The text returned by search() is scanned for its first run of digits,
        which is taken as the total page count; pages 2..count are then
        fetched with next_page(). Any exception is printed rather than
        propagated, and the browser is always shut down at the end.
        """
        try:
            total_text = search()
            # Raw string with an escaped \d: the unescaped pattern '(d+)' would
            # look for the literal letter "d" and never find the page count.
            match = re.search(r'(\d+)', total_text)
            if match is None:
                raise ValueError('no page count found in pager text: %r' % (total_text,))
            page_count = int(match.group(1))
            for num in range(2, page_count + 1):
                next_page(num)
        except Exception as e:
            print(e)
        finally:
            # quit() ends the whole WebDriver session (and the browser process);
            # close() would only close the current window.
            driver.quit()
    
    # Run the crawler only when this file is executed as a script.
    if __name__ == "__main__":
        main()
    View Code

    config文件

    # Command-line switches handed to the PhantomJS driver.
    SERVICE_ARGS = [
        '--load-images=false',
        '--disk-cache=true',
    ]

    # Search keyword used by the crawler.
    KEYWORD = '美食'
    View Code
  • 相关阅读:
    Restful接口传入多参数
    map转换成JSON的3种方式
    项目打包后执行start.sh提示“no such file or directory”解决办法,linux中给文件增加权限
    如何将一个a表a1字段与b表b1字段的笛卡尔积插入到e表中
    get、set方法的取代注释之lombok插件
    推荐 33 个 IDEA 最牛配置转(Java技术栈)
    mysql-----group by 对多个字段进行分组
    mysql获取某段时间内每一天的统计数据
    发票流水号生成方式
    postman之post请求传参
  • 原文地址:https://www.cnblogs.com/114811yayi/p/7226206.html
Copyright © 2011-2022 走看看