zoukankan      html  css  js  c++  java
  • 爬取淘宝商品信息

    import json
    import re

    from pyquery import PyQuery as pq
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    # Entry page the scraper starts from.
    url = "http://taobao.com"
    # One shared browser session (Internet Explorer driver) used by every function below.
    driver = webdriver.Ie()
    # Shared explicit wait: give up after 20 seconds, polling every 0.2 seconds.
    wait = WebDriverWait(driver, 20, poll_frequency=0.2)
    def search():
        """Open the start page, submit the hard-coded search keyword and scrape page 1.

        Returns:
            The text of the pager's "total" element; the caller extracts the
            page count from it.

        If any explicit wait times out, the whole search is retried from the
        beginning by calling this function again.
        """
        try:
            driver.get(url)
            # Waiting for the search box doubles as the "page has loaded" check.
            search_box = wait.until(EC.presence_of_element_located((By.ID, "q")))
            search_box.send_keys("美食")
            submit = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
            submit.click()
            # Pager element whose text contains the total number of pages.
            total = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")))

            get_products()

            return total.text
        except TimeoutException:
            # WebDriverWait.until raises selenium's TimeoutException, which is
            # not a subclass of the builtin TimeoutError, so the builtin would
            # never match and the retry would never run.
            return search()
    
    def next_page(page_number):
        """Jump to result page ``page_number`` via the pager form and scrape it.

        Args:
            page_number: The page index typed into the pager's input box.

        If any explicit wait times out, the same page is retried by calling
        this function again with the same argument.
        """
        try:
            # Pager input box for the target page number.
            page_input = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
            # Pager "confirm" button.
            submit = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
            page_input.clear()
            page_input.send_keys(page_number)
            submit.click()
            # The jump is complete once the highlighted pager item shows the requested number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                str(page_number)))

            get_products()

        except TimeoutException:
            # WebDriverWait.until raises selenium's TimeoutException, which is
            # not a subclass of the builtin TimeoutError, so the builtin would
            # never match and the retry would never run.
            next_page(page_number)
    
    def get_products():
        """Parse the currently loaded result page and store every listed product.

        Waits until at least one result item is present, parses the page source
        with pyquery, then prints and saves one dict per item.
        """
        item_selector = "#mainsrp-itemlist .items .item"
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
        # Parse a snapshot of the rendered page.
        page = pq(driver.page_source)
        for node in page(item_selector).items():
            product = {}
            product['image'] = node.find('.pic .img').attr('src')  # value of the image's src attribute
            product['price'] = node.find('.price').text()  # item price
            # Deal count: drop the last three characters, which the original
            # author notes are a trailing label after the number.
            product['deal'] = node.find('.deal-cnt').text()[:-3]
            product['title'] = node.find('.title').text()  # item title
            product['shop'] = node.find('.shop').text()  # shop name
            product['location'] = node.find('.location').text()  # location
            print(product)
            save_data(product)
    
    def save_data(result, path='淘宝商品信息'):
        """Append ``result`` to a text file as one JSON document per line.

        Args:
            result: A JSON-serialisable object (the scraper passes a product dict).
            path: Destination file; defaults to the file name the script has
                always used, so existing callers are unaffected.

        Non-ASCII characters are written as-is (``ensure_ascii=False``) and the
        file is UTF-8 encoded.
        """
        # The ``with`` block closes the file, so no explicit close() is needed.
        with open(path, 'a', encoding="utf-8") as f:
            # The line terminator must be the escape sequence '\n'; a raw line
            # break inside the quotes is a syntax error.
            f.write(json.dumps(result, ensure_ascii=False) + '\n')
    
    
    
    def main():
        """Run the whole scrape: search, read the page count, then visit pages 2..N.

        Raises:
            ValueError: If the pager text returned by ``search()`` contains no number.

        The browser is always closed, even when a step fails.
        """
        try:
            total_text = search()
            # Keep only the digits of the pager text; the pattern needs the
            # escaped ``\d`` (a bare ``d`` would match the letter "d").
            match = re.search(r'(\d+)', total_text)
            if match is None:
                raise ValueError('no page count found in pager text: %r' % (total_text,))
            total = int(match.group(1))
            print(total)
            # Page 1 was already scraped by search().
            for i in range(2, total + 1):
                next_page(i)
        finally:
            driver.quit()
    
    
    # Run the scraper only when this file is executed as a script, not when it is imported.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    新一代MQ apache pulsar的架构与核心概念
    Flutter使用fluwx实现微信分享
    BZOJ3622 已经没有什么好害怕的了 动态规划 容斥原理 组合数学
    NOIP2016提高组Day1T2 天天爱跑步 树链剖分 LCA 倍增 差分
    Codeforces 555C Case of Chocolate 其他
    NOIP2017提高组Day2T3 列队 洛谷P3960 线段树
    NOIP2017提高组Day2T2 宝藏 洛谷P3959 状压dp
    NOIP2017提高组Day1T3 逛公园 洛谷P3953 Tarjan 强连通缩点 SPFA 动态规划 最短路 拓扑序
    Codeforces 873F Forbidden Indices 字符串 SAM/(SA+单调栈)
    Codeforces 873E Awards For Contestants ST表
  • 原文地址:https://www.cnblogs.com/yaoliping/p/9630357.html
Copyright © 2011-2022 走看看