zoukankan      html  css  js  c++  java
  • 爬取京东商城某件商品信息

    通过Selenium抓取京东商城某件商品(如“ThinkPad”)搜索结果前3页的信息,包括:标题、价格、图片链接、评论数和店铺名称,并将这些信息存储至MySQL数据库中。

    import time
    import pymysql
    from bs4 import BeautifulSoup
    from urllib.parse import quote
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    # Search term that is URL-quoted into the JD search address.
    KEYWORD = 'ThinkPad'
    # Single Chrome session shared by all functions in this script.
    browser = webdriver.Chrome()
    # Explicit-wait helper bound to that session; each wait gives up after 10 seconds.
    wait = WebDriverWait(browser, timeout=10)
    
    def index_page(page):
        """Open the search-result list, jump to ``page`` and scrape it.

        The search URL for ``KEYWORD`` is always loaded first (it shows page 1).
        For ``page > 1`` the page number is typed into the "skip to page" box at
        the bottom of the list and submitted, then the function waits until the
        highlighted page link contains that number. Finally ``get_products`` is
        called to parse and store the items currently shown.

        Args:
            page: 1-based result page number.

        A ``TimeoutException`` from any wait is reported on stdout and the page
        is skipped; no retry is attempted.
        """
        print('正在爬取第', page, '')
        try:
            url = 'https://search.jd.com/Search?keyword=' + quote(KEYWORD)
            browser.get(url)
            _scroll_to_bottom()

            wait.until(EC.presence_of_all_elements_located((By.XPATH, '//li[@class="gl-item"]')))

            if page > 1:
                # Named page_box (not ``input``) so the builtin is not shadowed.
                page_box = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input')))
                page_box.clear()
                page_box.send_keys(str(page))
                page_box.send_keys(Keys.ENTER)

                _scroll_to_bottom()

                # The highlighted pager link must show the requested page number.
                wait.until(EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr'), str(page)))

            get_products()
        except TimeoutException:
            print('index_page: TimeoutException')


    def _scroll_to_bottom():
        """Wait 1s, scroll the shared browser to the bottom of the page, wait 2s.

        NOTE(review): the pauses presumably give lazily loaded list items time
        to render after the scroll -- confirm against the live page.
        """
        time.sleep(1)
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(2)
        
    def get_products():
        """Parse the page currently shown in ``browser`` and store every item.

        For each ``li.gl-item`` under ``#J_goodsList`` a dict with the keys
        ``image``, ``title``, ``price``, ``commit`` (comment count text) and
        ``shop`` is built, printed, and passed to ``write_to_sql``.

        Any element or attribute that is missing from an item yields an empty
        string for that field instead of raising, so a single irregular list
        item no longer aborts the whole crawl.
        """
        soup = BeautifulSoup(browser.page_source, 'lxml')
        for li in soup.select('#J_goodsList li.gl-item'):
            img = li.select_one('.p-img > a > img')
            image = None
            if img is not None:
                image = img.get('data-lazy-img')
                # 'done' (or no lazy attribute at all) means the real URL is in src.
                if image is None or image == 'done':
                    image = img.get('src')

            link = li.select_one('.p-img a')
            product = {
                'image': image or '',
                'title': link.get('title', '') if link is not None else '',
                'price': _first_text(li, '.p-price > strong'),
                'commit': _first_text(li, '.p-commit > strong > a'),
                'shop': _first_text(li, '.p-shop > span > a'),
            }
            print(product)
            write_to_sql(product)


    def _first_text(node, selector):
        """Return the text of the first match of ``selector`` under ``node``, or ''."""
        found = node.select_one(selector)
        return found.get_text() if found is not None else ''
            
    def create_sql():
        """Create the ``spiders`` database and its ``JD`` table if they are missing.

        Connects to the local MySQL server as ``root``, creates the database,
        then reconnects with that database selected and creates the table with
        five ``VARCHAR(255) NOT NULL`` columns: image, title, price, commit, shop.

        Both statements use ``IF NOT EXISTS`` so the script can be run more than
        once, and each connection is closed even when a statement fails.

        Raises:
            pymysql.MySQLError: if connecting or executing a statement fails.
        """
        server = pymysql.connect(host='localhost', user='root', password='123456', port=3306)
        try:
            cursor = server.cursor()
            cursor.execute("CREATE DATABASE IF NOT EXISTS spiders DEFAULT CHARACTER SET utf8")
        finally:
            server.close()

        db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spiders')
        try:
            cursor = db.cursor()
            sql = ("CREATE TABLE IF NOT EXISTS JD ("
                   "image VARCHAR(255) NOT NULL,"
                   "title VARCHAR(255) NOT NULL,"
                   "price VARCHAR(255) NOT NULL,"
                   "commit VARCHAR(255) NOT NULL,"
                   "shop VARCHAR(255) NOT NULL)")
            cursor.execute(sql)
        finally:
            db.close()
        
    def write_to_sql(data):
        """Insert one product row into the ``JD`` table of the ``spiders`` database.

        The statement is ``INSERT ... ON DUPLICATE KEY UPDATE`` with one ``%s``
        placeholder per column in both halves, so the values are passed twice.
        The transaction is committed only when ``cursor.execute`` reports a
        non-zero affected-row count.

        Args:
            data: mapping of column name -> value. Column names are formatted
                into the SQL text, so they must come from trusted code; values
                are bound as parameters. An empty mapping is ignored.

        Database errors are reported on stdout and rolled back, not raised.
        A new connection is opened per call and is always closed.
        """
        if not data:
            # Nothing to insert; an empty column list would be invalid SQL.
            return

        table = 'JD'
        columns = ', '.join(data)
        placeholders = ', '.join(['%s'] * len(data))
        assignments = ','.join('{key} = %s'.format(key=key) for key in data)
        sql = 'INSERT INTO {table}({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE {update}'.format(
            table=table, keys=columns, values=placeholders, update=assignments)

        db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spiders')
        try:
            cursor = db.cursor()
            try:
                # Values appear twice: once for VALUES (...), once for the UPDATE part.
                if cursor.execute(sql, tuple(data.values()) * 2):
                    print('Successful')
                    db.commit()
            except pymysql.MySQLError as exc:
                print('Failed', exc)
                db.rollback()
        finally:
            db.close()
            
    def main(pages=3):
        """Prepare the database, crawl the first ``pages`` result pages, then quit Chrome.

        Args:
            pages: number of result pages to crawl, starting at page 1
                (defaults to 3, matching the original hard-coded range).

        The shared browser session is always shut down, including when the
        database setup or a page crawl raises.
        """
        try:
            create_sql()
            for page in range(1, pages + 1):
                index_page(page)
                # Pause between pages before requesting the next one.
                time.sleep(5)
        finally:
            browser.quit()

    main()
  • 相关阅读:
    ASP.NET CORE3.0 API Swagger+IdentityServer4授权验证
    Ubuntu16.04安装RabbitMq并设置用户
    使用sql实现表90度旋转(矩阵转置)
    HDWiKi新架设网站打开慢原因排查1.4秒→0.03秒
    手机端网页设置了csswidth:100%,但不能显示完整的问题
    SQL Server 2008 收缩日志
    MSSQL查看一个数据库中所有表的行数
    那就简单说说这个服务器吧,题外话。
    win7单独安装php
    css兼容写法
  • 原文地址:https://www.cnblogs.com/oeong/p/11823600.html
Copyright © 2011-2022 走看看