zoukankan      html  css  js  c++  java
  • 爬取淘宝商品信息

    import json
    import re

    from pyquery import PyQuery as pq
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    # One browser session shared by every function in this script.
    driver = webdriver.Ie()
    # Explicit wait bound to that session: give up after 20 s, poll every 0.2 s.
    wait = WebDriverWait(driver, timeout=20, poll_frequency=0.2)
    # Page the scraper opens before typing the search keyword.
    url = "http://taobao.com"
    def search(max_retries=3):
        """Open the start page, search for "美食" and scrape the first result page.

        Loads ``url``, types the keyword into the search box, submits the form,
        waits for the pager's "total" element, then calls ``get_products()`` for
        the first page of results.

        Args:
            max_retries: How many times to start over after a Selenium timeout
                before giving up. Values below 0 are treated as 0.

        Returns:
            The text of the pager's ``div.total`` element (the caller extracts
            the page count from it).

        Raises:
            TimeoutException: If every attempt timed out.
        """
        attempts = max(1, max_retries + 1)
        for attempt in range(1, attempts + 1):
            try:
                driver.get(url)
                # Waiting for the search box doubles as the "page loaded" check.
                search_box = wait.until(EC.presence_of_element_located((By.ID, "q")))
                search_box.send_keys("美食")
                # Wait until the button can actually be clicked, not merely
                # until it exists in the DOM.
                submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
                submit.click()
                # Pager element holding the total number of pages.
                total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")))

                get_products()

                return total.text
            except TimeoutException:
                # WebDriverWait raises Selenium's TimeoutException, not the
                # builtin TimeoutError; retry a bounded number of times.
                if attempt == attempts:
                    raise
    
    def next_page(page_number, max_retries=3):
        """Jump to result page ``page_number`` and scrape it.

        Types the page number into the pager's input box, clicks the confirm
        button, waits until the pager's active item shows that number, then
        calls ``get_products()``.

        Args:
            page_number: The page to jump to.
            max_retries: How many times to repeat the jump after a Selenium
                timeout before giving up. Values below 0 are treated as 0.

        Raises:
            TimeoutException: If every attempt timed out.
        """
        page_text = str(page_number)
        attempts = max(1, max_retries + 1)
        for attempt in range(1, attempts + 1):
            try:
                # Pager input box.
                page_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
                # Pager confirm button; wait until it can actually be clicked.
                submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
                page_input.clear()
                page_input.send_keys(page_text)
                submit.click()
                # The jump is complete once the highlighted pager item shows
                # the requested page number.
                wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"), page_text))

                get_products()
                return
            except TimeoutException:
                # WebDriverWait raises Selenium's TimeoutException, not the
                # builtin TimeoutError; retry a bounded number of times.
                if attempt == attempts:
                    raise
    
    def get_products():
        """Parse the current result page and save one record per listed item.

        Waits until at least one item is present, parses the page source with
        pyquery, and for every item prints the extracted fields and passes
        them to ``save_data``.
        """
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"#mainsrp-itemlist .items .item")))
        # Parse a snapshot of the rendered page.
        page = pq(driver.page_source)
        for node in page('#mainsrp-itemlist .items .item').items():
            # Per the original author's note, the last three characters of the
            # deal text are the "付款人" suffix; slice them off.
            deal_text = node.find('.deal-cnt').text()
            record = {}
            record['image'] = node.find('.pic .img').attr('src')  # image URL from the src attribute
            record['price'] = node.find('.price').text()          # price
            record['deal'] = deal_text[:-3]                       # deal count
            record['title'] = node.find(".title").text()          # product title
            record['shop'] = node.find(".shop").text()            # shop name
            record['location'] = node.find(".location").text()    # location
            print(record)
            save_data(record)
    
    def save_data(result):
        """Append one record to the output file as a single line of JSON.

        Args:
            result: A JSON-serialisable object (the product dict built by
                ``get_products``).

        The file ``淘宝商品信息`` in the current working directory is opened in
        append mode with UTF-8 encoding. ``ensure_ascii=False`` keeps non-ASCII
        text readable instead of writing ``\\uXXXX`` escapes.
        """
        # The ``with`` block closes the file, so no explicit close() is needed.
        with open('淘宝商品信息', 'a', encoding="utf-8") as f:
            f.write(json.dumps(result, ensure_ascii=False) + '\n')
    
    
    
    def main():
        """Run the whole scrape: search, read the page count, walk every page.

        ``search()`` scrapes page 1 and returns the pager's total text; the
        first run of digits in that text is taken as the number of pages, and
        pages 2..total are then visited with ``next_page``.

        Raises:
            ValueError: If the pager text contains no digits.
        """
        try:
            total_text = search()
            # Raw string with an escaped \d so the pattern matches digits
            # rather than the literal letter "d".
            match = re.search(r'(\d+)', total_text)
            if match is None:
                raise ValueError(f"no page count found in pager text: {total_text!r}")
            total = int(match.group(1))
            print(total)
            for i in range(2, total + 1):
                next_page(i)
        finally:
            # Always shut the browser down, even when scraping fails midway.
            driver.quit()
    
    
    # Run the scraper only when this file is executed as a script, not on import.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    js--script和link中的 integrity 属性
    html之aria-hidden="true"
    S-HR之导入模板校验非当天变更限制
    拆分字符串
    S-HR之变动操作,变动原因,变动类型/离职操作,离职原因,离职类型
    mysql数学函数
    html页面引入
    springboot之server属性
    【开发工具】-Idea代码提示忽略大小写
    【个人提升】-怎样获得技术的成长(转)
  • 原文地址:https://www.cnblogs.com/yaoliping/p/9630357.html
Copyright © 2011-2022 走看看